This is a machine learning university project at the University of Potsdam (Universität Potsdam).
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cross_validation import StratifiedShuffleSplit
%matplotlib inline
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
#from jupyterthemes import jtplot
#jtplot.style('chesterish',grid=False)
# load data
#df = pd.read_csv(r'C:\Users\chenp\Desktop\00_CS_Master_Kurse_SS2018\ML1_IDA\P6_V\caravan.train.csv')
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Attribute-description table for the caravan (TIC) data set, tab-separated.
Info = pd.read_csv(r'C:\Users\chenp\Desktop\00_CS_Master_Kurse_SS2018\ML1_IDA\P6_V\caravan.info.csv', sep='\t')
print Info
#add a header row to the training data (C1-C86); define the first 3/4 training data as our Train data;
# Column names are prefixed C1..C86 followed by the original TIC attribute
# name; C86CARAVAN is the binary target (caravan policy bought or not).
Train = pd.read_csv(r'C:\Users\chenp\Desktop\00_CS_Master_Kurse_SS2018\ML1_IDA\P6_V\caravan.train.csv',
sep='\t', names = ["C1MOSTYPE", "C2MAANTHUI", "C3MGEMOMV","C4MGEMLEEF", "C5MOSHOOFD",
"C6MGODRK","C7MGODPR", "C8MGODOV", "C9MGODGE","C10MRELGE", "C11MRELSA",
"C12MRELOV","C13MFALLEEN", "C14MFGEKIND", "C15MFWEKIND","C16MOPLHOOG", "C17MOPLMIDD",
"C18MOPLLAAG","C19MBERHOOG", "C20MBERZELF", "C21MBERBOER","C22MBERMIDD",
"C23MBERARBG", "C24MBERARBO","C25MSKA", "C26MSKB1", "C27MSKB2","C28MSKC",
"C29MSKD", "C30MHHUUR","C31MHKOOP", "C32MAUT1", "C33MAUT2","C34MAUT0", "C35MZFONDS",
"C36MZPART","C37MINKMthirty", "C38MINK3045", "C39MINK4575","C40MINK7512", "C41MINK123M",
"C42MINKGEM","C43MKOOPKLA", "C44PWAPART", "C45PWABEDR","C46PWALAND", "C47PPERSAUT",
"C48PBESAUT","C49PMOTSCO", "C50PVRAAUT", "C51PAANHANG","C52PTRACTOR", "C53PWERKT",
"C54PBROM","C55PLEVEN", "C56PPERSONG", "C57PGEZONG","C58PWAOREG", "C59PBRAND",
"C60PZEILPL","C61PPLEZIER", "C62PFIETS", "C63PINBOED","C64PBYSTAND", "C65AWAPART",
"C66AWABEDR","C67AWALAND","C68APERSAUT", "C69ABESAUT", "C70AMOTSCO","C71AVRAAUT",
"C72AAANHANG", "C73ATRACTOR","C74AWERKT", "C75ABROM", "C76ALEVEN","C77APERSONG",
"C78AGEZONG", "C79AWAOREG","C80ABRAND","C81AZEILPL", "C82APLEZIER", "C83AFIETS",
"C84AINBOED", "C85ABYSTAND", "C86CARAVAN"])
#add a header row to the training data (C1-C85); define the last 1/4 training data as our Test data;
#Test = pd.read_csv(r'C:\Users\chenp\Desktop\00_CS_Master_Kurse_SS2018\ML1_IDA\P6_V\caravan.train.csv', sep='\t', names = ["C1_MOSTYPE", "C2_MAANTHUI", "C3_MGEMOMV","C4_MGEMLEEF", "C5_MOSHOOFD", "C6_MGODRK","C7_MGODPR", "C8_MGODOV", "C9_MGODGE","C10_MRELGE", "C11_MRELSA", "C12_MRELOV","C13_MFALLEEN", "C14_MFGEKIND", "C15_MFWEKIND","C16_MOPLHOOG", "C17_MOPLMIDD", "C18_MOPLLAAG","C19_MBERHOOG", "C20_MBERZELF", "C21_MBERBOER","C22_MBERMIDD", "C23_MBERARBG", "C24_MBERARBO","C25_MSKA", "C26_MSKB1", "C27_MSKB2","C28_MSKC", "C29_MSKD", "C30_MHHUUR","C31_MHKOOP", "C32_MAUT1", "C33_MAUT2","C34_ MAUT0", "C35_MZFONDS", "C36_MZPART","C37_MINKM30", "C38_MINK3045", "C39_MINK4575","C40_MINK7512", "C41_MINK123M", "C42_MINKGEM","C43_MKOOPKLA", "C44_PWAPART", "C45_PWABEDR","C46_PWALAND", "C47_PPERSAUT", "C48_PBESAUT","C49_PMOTSCO", "C50_PVRAAUT", "C51_PAANHANG","C52_PTRACTOR", "C53_PWERKT", "C54_PBROM","C55_PLEVEN", "C56_PPERSONG", "C57_PGEZONG","C58_PWAOREG", "C59_PBRAND", "C60_PZEILPL","C61_PPLEZIER", "C62_PFIETS", "C63_PINBOED","C64_PBYSTAND", "C65_AWAPART", "C66_AWABEDR","C67_AWALAND","C68_APERSAUT", "C69_ABESAUT", "C70_AMOTSCO","C71_AVRAAUT", "C72_AAANHANG", "C73_ATRACTOR","C74_AWERKT", "C75_ABROM", "C76_ALEVEN","C77_APERSONG", "C78_AGEZONG", "C79_AWAOREG","C80_ABRAND","C81_AZEILPL", "C82_APLEZIER", "C83_AFIETS","C84_AINBOED", "C85_ABYSTAND", "C86_CARAVAN"],skiprows=4366, nrows=1456)
#The orginal caravan.test.csv was renamed by me to caravan.output.csv
# Evaluation/output set, read with the same C1..C86 header as Train.
Output = pd.read_csv(r'C:\Users\chenp\Desktop\00_CS_Master_Kurse_SS2018\ML1_IDA\P6_V\caravan.output.csv', sep='\t', names = [
"C1MOSTYPE", "C2MAANTHUI", "C3MGEMOMV","C4MGEMLEEF", "C5MOSHOOFD",
"C6MGODRK","C7MGODPR", "C8MGODOV", "C9MGODGE","C10MRELGE", "C11MRELSA",
"C12MRELOV","C13MFALLEEN", "C14MFGEKIND", "C15MFWEKIND","C16MOPLHOOG", "C17MOPLMIDD",
"C18MOPLLAAG","C19MBERHOOG", "C20MBERZELF", "C21MBERBOER","C22MBERMIDD",
"C23MBERARBG", "C24MBERARBO","C25MSKA", "C26MSKB1", "C27MSKB2","C28MSKC",
"C29MSKD", "C30MHHUUR","C31MHKOOP", "C32MAUT1", "C33MAUT2","C34MAUT0", "C35MZFONDS",
"C36MZPART","C37MINKMthirty", "C38MINK3045", "C39MINK4575","C40MINK7512", "C41MINK123M",
"C42MINKGEM","C43MKOOPKLA", "C44PWAPART", "C45PWABEDR","C46PWALAND", "C47PPERSAUT",
"C48PBESAUT","C49PMOTSCO", "C50PVRAAUT", "C51PAANHANG","C52PTRACTOR", "C53PWERKT",
"C54PBROM","C55PLEVEN", "C56PPERSONG", "C57PGEZONG","C58PWAOREG", "C59PBRAND",
"C60PZEILPL","C61PPLEZIER", "C62PFIETS", "C63PINBOED","C64PBYSTAND", "C65AWAPART",
"C66AWABEDR","C67AWALAND","C68APERSAUT", "C69ABESAUT", "C70AMOTSCO","C71AVRAAUT",
"C72AAANHANG", "C73ATRACTOR","C74AWERKT", "C75ABROM", "C76ALEVEN","C77APERSONG",
"C78AGEZONG", "C79AWAOREG","C80ABRAND","C81AZEILPL", "C82APLEZIER", "C83AFIETS",
"C84AINBOED", "C85ABYSTAND", "C86CARAVAN"])
# Output restricted to purchasing-power class (C43) plus the policy
# contribution (P*) and count (A*) columns; the other sociodemographic
# M* columns are dropped.
OutputdropM = Output[["C43MKOOPKLA", "C44PWAPART", "C45PWABEDR","C46PWALAND", "C47PPERSAUT",
"C48PBESAUT","C49PMOTSCO", "C50PVRAAUT", "C51PAANHANG","C52PTRACTOR", "C53PWERKT",
"C54PBROM","C55PLEVEN", "C56PPERSONG", "C57PGEZONG","C58PWAOREG", "C59PBRAND",
"C60PZEILPL","C61PPLEZIER", "C62PFIETS", "C63PINBOED","C64PBYSTAND", "C65AWAPART",
"C66AWABEDR","C67AWALAND","C68APERSAUT", "C69ABESAUT", "C70AMOTSCO","C71AVRAAUT",
"C72AAANHANG", "C73ATRACTOR","C74AWERKT", "C75ABROM", "C76ALEVEN","C77APERSONG",
"C78AGEZONG", "C79AWAOREG","C80ABRAND","C81AZEILPL", "C82APLEZIER", "C83AFIETS",
"C84AINBOED", "C85ABYSTAND"]]
# Output restricted to the 8 hand-picked top features (see ANOVA ranking below).
OutputselecdropM = Output[["C47PPERSAUT","C44PWAPART", "C61PPLEZIER", "C43MKOOPKLA","C59PBRAND","C64PBYSTAND","C58PWAOREG","C62PFIETS"]]
# Output restricted to C43 plus the contribution (P*) columns only.
OutputdropM2 = Output[["C43MKOOPKLA", "C44PWAPART", "C45PWABEDR","C46PWALAND", "C47PPERSAUT",
"C48PBESAUT","C49PMOTSCO", "C50PVRAAUT", "C51PAANHANG","C52PTRACTOR", "C53PWERKT",
"C54PBROM","C55PLEVEN", "C56PPERSONG", "C57PGEZONG","C58PWAOREG", "C59PBRAND",
"C60PZEILPL","C61PPLEZIER", "C62PFIETS", "C63PINBOED","C64PBYSTAND"]]
#view the 5 rows of the raw data in the training data and the testing data
print "first 5 rows of Train are:\n", Train.head()
#print "first 5 rows of Test are:\n", Test.head()
#view the raw data of Train and Test
print "Train:\n" , Train
#print "Test:\n", Test
#print "TraindropM:\n", TraindropM
#scaler = preprocessing.StandardScaler().fit(Train)
If there are any missing values, they should be handled. However, it turns out that there are no missing values in this dataset.
#check if there is null value in the raw training data and testing data or not:
print "sum of Null value in Train:", Train.isnull().sum().sum()
Train.info()
Output.info()
# Thematic sub-frames of the sociodemographic (M*) attributes, used only
# for exploratory inspection below.
Religion6_9 = Train[['C6MGODRK','C7MGODPR', 'C8MGODOV', 'C9MGODGE']]
Relationship10_13 = Train[["C10MRELGE", "C11MRELSA", "C12MRELOV","C13MFALLEEN"]]
Children14_15 = Train[["C14MFGEKIND", "C15MFWEKIND"]]
Education16_18 = Train[["C16MOPLHOOG", "C17MOPLMIDD", "C18MOPLLAAG"]]
Job19_24 = Train[["C19MBERHOOG", "C20MBERZELF", "C21MBERBOER","C22MBERMIDD",
"C23MBERARBG", "C24MBERARBO"]]
SocialClass24_29 = Train[["C24MBERARBO","C25MSKA", "C26MSKB1", "C27MSKB2","C28MSKC",
"C29MSKD"]]
HauseOwnership30_31 = Train[["C30MHHUUR","C31MHKOOP"]]
CarNumber32_34 = Train[["C32MAUT1", "C33MAUT2","C34MAUT0"]]
Income37_41 = Train[["C37MINKMthirty", "C38MINK3045", "C39MINK4575","C40MINK7512", "C41MINK123M"]]
print Religion6_9 ,'\n', Relationship10_13, '\n', Children14_15,'\n', Education16_18, '\n',Job19_24,'\n',SocialClass24_29,'\n',HauseOwnership30_31,'\n',CarNumber32_34,'\n',Income37_41,'\n',Train[['C43MKOOPKLA']],'\n',Train[['C42MINKGEM']]
##Todo: Feature Representaition: Transform attributes, depending of model structure For instance, linear model compute inner product of attributes and model parameters. All attributes have to be numeric. Larger attribute values: larger value of inner product Categorial attributes, attributes without ordering, textual attributes have to be converted.
# Pearson correlation heatmap over all attributes (rendered in the notebook).
Train.corr(method='pearson').style.format("{:.2}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)
# Correlation heatmap restricted to the policy (P*/A*) columns.
Train[["C44PWAPART", "C45PWABEDR","C46PWALAND", "C47PPERSAUT",
"C48PBESAUT","C49PMOTSCO", "C50PVRAAUT", "C51PAANHANG","C52PTRACTOR", "C53PWERKT",
"C54PBROM","C55PLEVEN", "C56PPERSONG", "C57PGEZONG","C58PWAOREG", "C59PBRAND",
"C60PZEILPL","C61PPLEZIER", "C62PFIETS", "C63PINBOED","C64PBYSTAND", "C65AWAPART", "C66AWABEDR","C67AWALAND","C68APERSAUT", "C69ABESAUT", "C70AMOTSCO","C71AVRAAUT",
"C72AAANHANG", "C73ATRACTOR","C74AWERKT", "C75ABROM", "C76ALEVEN","C77APERSONG",
"C78AGEZONG", "C79AWAOREG","C80ABRAND","C81AZEILPL", "C82APLEZIER", "C83AFIETS",
"C84AINBOED", "C85ABYSTAND"]].corr(method='pearson').style.format("{:.2}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)
It is clear that our dataset is highly unbalanced, with only 6.36% of observations actually buying the insurance.
from __future__ import division
# NOTE: a mid-file __future__ import is only legal here because each
# notebook cell is compiled separately; in a plain .py file it must be
# the first statement. It makes 348/5474 below a true division (Python 2).
#y_Train_original.value_counts().plot(kind='bar', title='Classifying CARAVAN 2', color='steelblue', grid=True)
#print "y_Train: \n", y_Train_original.value_counts()
#print "\n"
#y_TestdropM2.value_counts().plot(kind='bar', title='Classifying CARAVAN', color='steelblue', grid=True)
#print "y_Test: \n",y_Test.value_counts()
#print "\n"
# Full feature matrix X and binary target y (C86CARAVAN).
X = Train.drop(['C86CARAVAN'], axis=1)
y = Train['C86CARAVAN']
plt.subplot(1,1,1)
y.value_counts().plot(kind='bar', title='Classifying CARAVAN: all Data', color='steelblue', grid=True)
print "y: \n",y.value_counts()
print("Caravan Ratio: {:.2%}".format(348/5474))
#print float(format(), .02%)
Determination of target customers for caravan insurance. This is a cross-selling problem: cross-selling is the action or practice of selling an additional product or service to an existing customer (https://en.wikipedia.org/wiki/Cross-selling).
# Apart from 'Purchasing Power Class', all sociodemographic variables derived from zip codeswere discarded, because they did not add predictive power to the model.except for "C1MOSTYPE", "C2MAANTHUI", "C3MGEMOMV","C4MGEMLEEF", "C5MOSHOOFD","C42MINKGEM","C43MKOOPKLA"
TraindropM = Train[["C43MKOOPKLA", "C44PWAPART", "C45PWABEDR","C46PWALAND", "C47PPERSAUT",
"C48PBESAUT","C49PMOTSCO", "C50PVRAAUT", "C51PAANHANG","C52PTRACTOR", "C53PWERKT",
"C54PBROM","C55PLEVEN", "C56PPERSONG", "C57PGEZONG","C58PWAOREG", "C59PBRAND",
"C60PZEILPL","C61PPLEZIER", "C62PFIETS", "C63PINBOED","C64PBYSTAND", "C65AWAPART",
"C66AWABEDR","C67AWALAND","C68APERSAUT", "C69ABESAUT", "C70AMOTSCO","C71AVRAAUT",
"C72AAANHANG", "C73ATRACTOR","C74AWERKT", "C75ABROM", "C76ALEVEN","C77APERSONG",
"C78AGEZONG", "C79AWAOREG","C80ABRAND","C81AZEILPL", "C82APLEZIER", "C83AFIETS",
"C84AINBOED", "C85ABYSTAND", "C86CARAVAN"]]
print "first 5 rows of TraindropM are:\n", TraindropM.head()
print TraindropM.describe()
TraindropM2 = Train[["C43MKOOPKLA", "C44PWAPART", "C45PWABEDR","C46PWALAND", "C47PPERSAUT",
"C48PBESAUT","C49PMOTSCO", "C50PVRAAUT", "C51PAANHANG","C52PTRACTOR", "C53PWERKT",
"C54PBROM","C55PLEVEN", "C56PPERSONG", "C57PGEZONG","C58PWAOREG", "C59PBRAND",
"C60PZEILPL","C61PPLEZIER", "C62PFIETS", "C63PINBOED","C64PBYSTAND", "C86CARAVAN"]]
print "first 5 rows of TraindropM are:\n", TraindropM.head()
# Feature size
XdropM = TraindropM.drop(['C86CARAVAN'], axis=1)
#ydropM = TraindropM['C86CARAVAN']
X_Train_originaldropM,X_TestdropM,y_Train_originaldropM,y_TestdropM= train_test_split(XdropM, y, test_size=0.3,random_state=42)
print('X and y Input Data: ', XdropM.shape, y.shape)
print('Train Set Shape: ', X_Train_originaldropM.shape, y_Train_originaldropM.shape)
print('Test Set Shape: ', X_TestdropM.shape, y_TestdropM.shape)
#y_Train = Train['C86_CARAVAN']
#y_Test = Test['C86_CARAVAN']
#X_Train = Train.drop(['C86_CARAVAN'], axis=1)
#X_Test = Test.drop(['C86_CARAVAN'], axis=1)
#print "\n\n"
#print "The description of features of trian: \n \n", X_Train_originaldropM.describe()
#print "\n\n"
#print "The description of y of trian: \n \n", y_Train_originaldropM.describe()
#print "\n\n"
#print "The description of features of test:\n \n", X_TestdropM.describe()
#print "\n\n"
#print "The description of y of trian: \n \n", y_TestdropM.describe()
X_TraindropM2 = TraindropM2.drop(['C86CARAVAN'], axis=1)
y_TraindropM2 = TraindropM2['C86CARAVAN']
#scalerT = preprocessing.StandardScaler().fit(Train)
#min_max_scaler = preprocessing.MinMaxScaler()
#Train = scaler.fit_transform(Train)
#print Train
#scaler = preprocessing.StandardScaler().fit(X)
#min_max_scaler = preprocessing.MinMaxScaler()
#X = scaler.fit_transform(X)
#
#print X
X_Train_original,X_TestdropM2,y_Train_original,y_TestdropM2= train_test_split(X_TraindropM2, y_TraindropM2, test_size=0.3,random_state=42)
print('X and y Input Data: ', X_TraindropM2.shape, y_TraindropM2.shape)
print('Test Set Shape: ', X_Train_original.shape, y_Train_original.shape)
print('Test Set Shape: ', X_TestdropM2.shape, y_TestdropM2.shape)
#y_Train = Train['C86_CARAVAN']
#y_Test = Test['C86_CARAVAN']
#X_Train = Train.drop(['C86_CARAVAN'], axis=1)
#X_Test = Test.drop(['C86_CARAVAN'], axis=1)
print "\n\n"
print "The description of features of trian: \n \n", X_Train_original.describe()
print "\n\n"
#print "The description of y of trian: \n \n", y_Train_original.describe()
print "\n\n"
#print "The description of features of test:\n \n", X_Test.describe()
print "\n\n"
#print "The description of y of trian: \n \n", y_Test.describe()
#fig = plt.figure(figsize=(10,10))
# Tells the total count of different values in CARAVAN
from imblearn.over_sampling import SMOTE
# Oversample the minority (caravan = 1) class with SMOTE so the classifiers
# are not dominated by the majority class.
# NOTE: `kind='regular'` and `fit_sample` belong to the imblearn 0.3-era API;
# newer releases use plain SMOTE() with fit_resample().
# BUG FIX: the if/else bodies lost their indentation in the notebook export
# (SyntaxError as written); restored below without changing any statement.
doOversampling = True
if doOversampling:
    # Apply regular SMOTE
    sm = SMOTE(kind='regular')
    X_TraindropM, y_TraindropM = sm.fit_sample(X_Train_originaldropM, y_Train_originaldropM)
    print('Training Set Shape after oversampling: ', X_TraindropM.shape, y_TraindropM.shape)
    print(pd.crosstab(y_TraindropM,y_TraindropM))
else:
    X_TraindropM = X_Train_originaldropM
    y_TraindropM = y_Train_originaldropM
doOversampling2 = True
if doOversampling2:
    # Apply regular SMOTE
    sm = SMOTE(kind='regular')
    X_TraindropM2, y_TraindropM2 = sm.fit_sample(X_Train_original, y_Train_original)
    print('Training Set Shape after oversampling: ', X_TraindropM2.shape, y_TraindropM2.shape)
    print(pd.crosstab(y_TraindropM2,y_TraindropM2))
else:
    X_TraindropM2 = X_Train_original
    y_TraindropM2 = y_Train_original
##Transforms features by scaling each feature to a given range
from statsmodels.stats import anova
# NOTE: this rebinds `sm`, which the SMOTE cell above bound to a SMOTE
# instance; order-sensitive if cells are re-run out of sequence.
import statsmodels.api as sm
from statsmodels.formula.api import ols
# OLS of the target on C43 + the contribution (P*) columns; the per-term
# ANOVA F statistics below serve as a univariate feature ranking.
mtmodel1 = ols('C86CARAVAN ~ C43MKOOPKLA + C44PWAPART + C45PWABEDR + C46PWALAND + C47PPERSAUT + C48PBESAUT + C49PMOTSCO + C50PVRAAUT + C51PAANHANG + C52PTRACTOR + C53PWERKT + C54PBROM + C55PLEVEN + C56PPERSONG + C57PGEZONG + C58PWAOREG + C59PBRAND + C60PZEILPL + C61PPLEZIER + C62PFIETS + C63PINBOED + C64PBYSTAND ', Train).fit()
#Anova table for one or more fitted linear models.Single factor analysis, prediction power : http://www.statisticshowto.com/probability-and-statistics/f-statistic-value-test/ # http://www.statisticshowto.com/support-or-reject-null-hypothesis/
# An F statistic is a value you get when you run an ANOVA test or a regression analysis to find out if the means between two populations are significantly different.
anovatable = sm.stats.anova_lm(mtmodel1)
# Sort so that high-F / low-p terms rise to the top of the table.
anovatable_sorted = anovatable.sort_values(['df', 'sum_sq', 'mean_sq','F','PR(>F)'], ascending = [False,False, False, True,False])
# Highlight table rows on hover when rendered in the notebook.
anovatable_sorted.style.set_table_styles(
[{'selector': 'tr:hover',
'props': [('background-color', 'yellow')]}]
)
#print(anova.anova_lm(mt_model1))
#Xselec = (Train[Train.columns[[47,44,61,1,16,10,82,59]]].values)
#print Xselec
#select top 8 features to avoid overfit
TrainselecdropM = TraindropM2[["C47PPERSAUT","C44PWAPART", "C61PPLEZIER", "C43MKOOPKLA","C59PBRAND","C64PBYSTAND","C58PWAOREG","C62PFIETS"]]
# BUG FIX: the bare `TrainselecdropM` display expression originally appeared
# BEFORE the assignment above and raised a NameError on a clean
# top-to-bottom run; it now follows the definition.
TrainselecdropM
print (TrainselecdropM.describe())
# Highlight table rows on hover when rendered in the notebook.
TrainselecdropM.style.set_table_styles(
[{'selector': 'tr:hover',
'props': [('background-color', 'yellow')]}]
)
XselecdropM = TrainselecdropM
#yselecdropM = TraindropM['C86CARAVAN']
# 70/30 split of the 8 selected features against the full target vector y.
X_TrainselecdropM,X_TestselecdropM,y_TrainselecdropM,y_TestselecdropM= train_test_split(XselecdropM, y, test_size=0.3,random_state=42)
print('X and y Input Data: ', XselecdropM.shape, y.shape)
print('Train Set Shape: ', X_TrainselecdropM.shape, y_TrainselecdropM.shape)
print('Test Set Shape: ', X_TestselecdropM.shape, y_TestselecdropM.shape)
#y_Train = Train['C86_CARAVAN']
#y_Test = Test['C86_CARAVAN']
#X_Train = Train.drop(['C86_CARAVAN'], axis=1)
#X_Test = Test.drop(['C86_CARAVAN'], axis=1)
#print "\n\n"
#print "The description of features of trian with top 8 selected features: \n \n", X_TrainselecdropM.describe()
#print "\n\n"
#print "The description of y of trian with top 8 selected features: \n \n", y_TrainselecdropM.describe()
#print "\n\n"
#print "The description of features of test with top 8 selected features:\n \n", X_TestselecdropM.describe()
#print "\n\n"
#print "The description of y of trian with top 8 selected features: \n \n", y_TestselecdropM.describe()
# Correlation heatmap of the selected features.
TrainselecdropM.corr(method='pearson').style.format("{:.2}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)
Train, test, and evaluate the Random Forest classifier on the training data after dropping most of the sociodemographic variables derived from zip codes.
We choose TraindropM2 to evaluate the model, since this data set is the cleanest of the three candidate data sets.
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest with default hyperparameters on the SMOTE-balanced
# P*-only training split; `clf2` is reused by the evaluation cells below.
clf2 = RandomForestClassifier()
clf2.fit(X_TraindropM2, y_TraindropM2)
test_predictions2 = clf2.predict(X_TestdropM2)
#define the Colorcodes
# ANSI terminal escape sequences ('\33' is octal for ESC); the C*BG variants
# set the background color and CEND resets all formatting.
CBLACK = '\33[30m'
CRED = '\33[31m'
CGREEN = '\33[32m'
CYELLOW = '\33[33m'
CBLUE = '\33[34m'
CVIOLET = '\33[35m'
CBEIGE = '\33[36m'
CWHITE = '\33[37m'
CBLACKBG = '\33[40m'
CREDBG = '\33[41m'
CGREENBG = '\33[42m'
CYELLOWBG = '\33[43m'
CBLUEBG = '\33[44m'
CVIOLETBG = '\33[45m'
CBEIGEBG = '\33[46m'
CWHITEBG = '\33[47m'
CGREY = '\33[90m'
CEND = '\033[0m'
from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score,f1_score,accuracy_score,log_loss,confusion_matrix
from __future__ import division
print('****ResultsselecdropM****')
print('\n----------------Unhelpful Scores\n')
test_predictionsdropMR = clf2.predict(X_TestdropM2)
accdropMR = f1_score(y_TestdropM2, test_predictionsdropMR)
print("F-scoreselecdropM: {:.2%}".format(accdropMR))
test_predictionsdropMR = clf2.predict(X_TestdropM2)
acc2dropMR = accuracy_score(y_TestdropM2, test_predictionsdropMR)
print('Model accuracyselecdropM: {:.2%} '.format(acc2dropMR))
print('\n----------------Useful Scores: loss and cost-benefit scores\n')
print('ROC just used here this to check overfitting(since if focus on positive values(0 here, not caravan), but we cares more about negative values): \nIf both curves are not too far from each other indicates (Train above test) that there is little overfitting, if the roc of the train is much better that the test and both curves are far from each other, then it´s overfiting. Otherwise underfitting.')
RocScoreR=roc_auc_score(y_TestdropM2, clf2.predict(X_TestdropM2))
fprBR, tprBR, thresholdsBR = roc_curve(y_TestdropM2, clf2.predict_proba(X_TestdropM2)[:,1])
RocScoreTrainR=roc_auc_score(y_TraindropM2, clf2.predict(X_TraindropM2))
fprBTrainR, tprBTrainR, thresholdsBTrainR = roc_curve(y_TraindropM2, clf2.predict_proba(X_TraindropM2)[:,1])
print RocScoreR
print RocScoreTrainR
plt.figure()
plt.plot(fprBR, tprBR, label='classifiersTest' % RocScoreR)
plt.plot(fprBTrainR, tprBTrainR, label='classifiersTrain' % RocScoreTrainR)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
test_predictionsdropMR = clf2.predict_proba(X_TestdropM2)
llselecdropMR = log_loss(y_TestdropM2, test_predictionsdropMR)
print("Log LossselecdropM: {:.2f}".format(llselecdropMR))
print"Confusion Matrix: \n", confusion_matrix(y_TestdropM2, clf2.predict(X_TestdropM2))
test_predictionsselecdropMR = clf2.predict(X_TestdropM2)
confusionselecdropMR = metrics.confusion_matrix(y_TestdropM2, test_predictionsselecdropMR)
TNselecdropMR = confusionselecdropMR[0, 0]
TPselecdropMR = confusionselecdropMR[1, 1]
FNselecdropMR = confusionselecdropMR[1, 0]
FPselecdropMR = confusionselecdropMR[0, 1]
BenefitItemselecdropMR = TPselecdropMR
BenefitCoselecdropMR = TPselecdropMR / (TPselecdropMR + FNselecdropMR) # this is specificity in statistics
print("BenefitItemselecdropM: {}".format(BenefitItemselecdropMR))
print("BenefitCoselecdropM: {:.2%}".format(BenefitCoselecdropMR))
CostItemselecdropMR = (TPselecdropMR + FPselecdropMR)
CostCoselecdropMR = (TPselecdropMR + FPselecdropMR) / (TPselecdropMR + TNselecdropMR + FPselecdropMR +FNselecdropMR)
print("CostItemselecdropM: {}".format(CostItemselecdropMR))
print("CostCoselecdropM: {:.2%}".format(CostCoselecdropMR))
if CostCoselecdropMR == 0:
ImproveRatioselecdropMR = 0
else:
ImproveRatioselecdropMR = format((BenefitItemselecdropMR/CostItemselecdropMR) /((TPselecdropMR+FNselecdropMR) /(TPselecdropMR + TNselecdropMR + FPselecdropMR +FNselecdropMR)), '.2%')
print(CBLUE+"ImproveRatioselecdropM: {}".format(ImproveRatioselecdropMR)+CEND)
#features = Test.drop(['CARAVAN'], axis=1)
# Rank and plot the forest's feature importances over the 22 P*-set columns;
# the error bars are the std of importances across the individual trees.
importances2 = clf2.feature_importances_
std2 = np.std([tree.feature_importances_ for tree in clf2.estimators_],
axis=0)
indices2 = np.argsort(importances2[0:22])[::-1]
indices = indices2[0:22]
# Print the feature ranking
print("Feature ranking:")
#features = Train.columns
# BUG FIX: the loop body lost its indentation in the notebook export
# (SyntaxError as written); restored without changing the statement.
for f in range(22):
    print("%d. %s (%f)" % (f + 1, (TraindropM2.columns.values[:22]).reshape(-1)[indices[f]], importances2[indices[f]]))
# Plot the feature importances of the forest
#import pylab as pl
plt.figure(figsize=(14, 3))
plt.title("Feature importances")
plt.bar(range(22), importances2[indices], yerr=std2[indices], color="steelblue", align="center")
plt.yticks(size=14,color="#201506")
plt.xticks(range(22), TraindropM2.columns.values[:22].reshape(-1)[indices], rotation='vertical',size=12,color="#201506")
# NOTE(review): xlim caps the view at 12 of the 22 bars -- presumably to show
# only the top-ranked features; confirm this is intentional.
plt.xlim([-1, 12])
plt.show()
# Imports for the model-comparison section below.
# The __future__ import is placed first (it must precede other statements in
# a compilation unit); it was originally at the end of this cell.
from __future__ import division
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB, BernoulliNB
# BUG FIX: replaced the wildcard `from sklearn.neural_network import *`
# (pollutes the namespace); only MLPClassifier is referenced in this notebook.
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from IPython.display import display
from itertools import compress
from math import isnan
from sklearn import tree
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score
classifiers = [
#Gruppe 1: K Neighbors
#Classification is computed from a simple majority vote of the nearest neighbors of each point:
#a query point is assigned the data class which has the most representatives within the nearest neighbors of the point.
KNeighborsClassifier(3),
#Gruppe 2: Boosting
# Boosting is a machine learning ensemble meta-algorithm for primarily reducing bias, and also variance in supervised learning, and a family of machine learning algorithms that convert weak learners to strong ones.
# An AdaBoost classifier is a meta-estimator that begins by fitting a classifier on the original dataset and then fits additional copies of the classifier on the same dataset but where the weights of incorrectly classified instances are adjusted such that subsequent classifiers focus more on difficult cases.
AdaBoostClassifier(),
# GB builds an additive model in a forward stage-wise fashion; it allows for the optimization of arbitrary differentiable loss functions. In each stage n_classes_ regression trees are fit on the negative gradient of the binomial or multinomial deviance loss function. Binary classification is a special case where only a single regression tree is induced.
GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42),
# What: Light GBM is a gradient boosting framework that uses tree based learning algorithm.
# Pros: handle the large size of data and takes lower memory to run;focuses on accuracy of results.
# Cons: Light GBM is sensitive to overfitting and can easily overfit small data.
lgb.LGBMClassifier(boosting_type='gbdt', class_weight=None,
colsample_bytree=0.6311794044268164,
learning_rate=0.027802518491219938, max_depth=-1, metric='auc',
min_child_samples=250, min_child_weight=0.001, min_split_gain=0.0,
n_estimators=138, n_jobs=-1, num_leaves=40, objective='binary',
random_state=50, reg_alpha=0.06183118355912668,
reg_lambda=0.24742831407472365, silent=True,
subsample=0.999742610271968, subsample_for_bin=280000,
subsample_freq=1, verbose=1),
# Neural Network performs not so good and actually not as suitable as other models in this situation
# MLPClassifier(activation='relu', alpha=1e-05,
# batch_size='auto', beta_1=0.9, beta_2=0.999, early_stopping=False,
# epsilon=1e-08, hidden_layer_sizes=(64), learning_rate='constant',
# learning_rate_init=0.001, max_iter=2000, momentum=0.9,
# nesterovs_momentum=True, power_t=0.5, random_state=42, shuffle=True,
# tol=0.001, validation_fraction=0.1, verbose=True,
# warm_start=False),
#Gruppe 3: Trees
# lgb.LGBMClassifier and GradientBoostingClassifier in Gruppe 1 also uses tree methods
# Decision Trees (DTs) are a non-parametric supervised learning method used for classification and regression. The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features.
# Pruning: Remove test nodes whose leaves have less than 𝜏 instances. Collect in new leaf node that is labeled with the majority class
# Pruning parameter 𝜏 is a regularization parameter that has to be tuned (e.g., by cross validation).
DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=15,
min_samples_split=2, min_samples_leaf=1,
min_weight_fraction_leaf=0.0, max_features=None,
max_leaf_nodes=None, min_impurity_decrease=1e-07),
# A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and use averaging to improve the predictive accuracy and control over-fitting. The sub-sample size is always the same as the original input sample size but the samples are drawn with replacement if bootstrap=True (default).
# Random Forests are an improvement over bagged decision trees.
# In statistics, bootstrapping is any test or metric that relies on random sampling with replacement. #Bagging: Bootstrap aggregating, also called bagging, is a machine learning ensemble meta-algorithm designed to improve the stability and accuracy of machine learning algorithms used in statistical classification and regression. It also reduces variance and helps to avoid overfitting. Although it is usually applied to decision tree methods, it can be used with any type of method.
RandomForestClassifier(n_estimators=500, criterion='gini', max_depth=15,
min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
max_features='auto', max_leaf_nodes=None, min_impurity_decrease=1e-07,
bootstrap=True, oob_score=False, n_jobs=1,
random_state=42, verbose=1, warm_start=False, class_weight='balanced_subsample'),
# RF vs ET: Both methods are about the same, with the ET being a bit worse when there is a high number of noisy features (in high dimensional data-sets).That said, provided the (perhaps manual) feature selection is near optimal, the performance is about the same, however, ET's can be computationally faster.
# This class implements a meta estimator that fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and use averaging to improve the predictive accuracy and control over-fitting.
# The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y))
# The “balanced_subsample” mode is the same as “balanced” except that weights are computed based on the bootstrap sample for every tree grown.
ExtraTreesClassifier(n_estimators=500, criterion='gini', max_depth=15,
min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
max_features='auto', max_leaf_nodes=None, min_impurity_decrease=1e-07,
bootstrap=True, oob_score=False, n_jobs=1,
random_state=42, verbose=1, warm_start=False, class_weight='balanced_subsample'),
#Gruppe 4: Bayes
# Bayes‘ equation: 𝑃(𝜃|𝐗,𝐲) =(𝑃(𝐲|𝐗,𝜃)/𝑃(𝜃))/ 𝑃(𝐲|𝐗)
# Classification: a posteriori (“posterior”) probability that θ is the correct parameter given observations 𝐲|𝐗.
# 𝑃(𝜃): A priori (“prior”) probability of nature choosing θ: System parameter 𝜃∗ (randomly)
# 𝑃(𝐲|𝐗,𝜃): Likelihood of observing 𝐲|𝐗 when model parameter is 𝜃.
# Probability of observing 𝐲|𝐗; independent of 𝜃.
# Maximum-likelihood(ML):𝜃ML = argmaxbelow𝜃 (𝑃(𝐲|𝐗,𝜃)) = argminbelow𝛉 ∑𝑖=1 𝑛 (log(1 + ehoch(−𝑦𝑖𝐱𝑖T𝛉))) <ML:Logistic Regression,using (stochastic) gradient descent>
# Maximum-a-positeriori(MAP): 𝜃MAP = argmaxbelow𝜃 (𝑃(𝜃|𝐲,𝐗)) = argminbelow𝛉 ∑𝑖=1 𝑛 (log(1 + e(hoch−𝑦𝑖𝐱𝑖T𝛉)) + (1/2𝜎((low𝑝)(hoch2))𝛉T𝛉 A posteriori (“posterior”) distribution: a posteriori (“posterior”) probability that θ is the correct parameter given observations 𝐲|𝐗.
# Bayes’ theorem Classification: Predictive distribution given the data 𝑃 (𝑦|𝐱∗,𝐲,𝐗) = ∫(𝑃(𝑦|𝛉,𝐱∗)𝑃(𝛉|𝐲,𝐗)d𝛉= ∫(1/(1 + e−𝑦𝐱∗T))𝛉𝑁(𝛉|𝟎,𝜎(hoch2)𝐈)𝑑𝛉. No closed-form solution for logistic regression. Possible to approximate by sampling from the posterior. Standard approximation: use only MAP model instead of integrating over model space.
# Bayes’ theorem describes the probability of an event, based on prior knowledge of conditions that might be related to the event.
# P(A\B)= (P(B\A)\P(A))\(P(B))where A and B are events
# P(A\B) is a conditional probability: the likelihood of event A occurring given that B is true.
# {P(B\ A)} is also a conditional probability: the likelihood of event B occurring given that {\displaystyle A} A is true.
# P(A) and P(B) are the probabilities of observing A and B independently of each other; this is known as the marginal probability.
#Gaussian Naive Bayes
#GaussianNB(),
#Gaussian: It is used in classification and it assumes that features follow a normal distribution.
#Multinomial: It is used for discrete counts. For example, let’s say, we have a text classification problem. Here we can consider bernoulli trials which is one step further and instead of “word occurring in the document”, we have “count how often word occurs in the document”, you can think of it as “number of times outcome number x_i is observed over the n trials”.
# Linear Bayes classification ##http://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html
#A classifier with a linear decision boundary, generated by fitting class conditional densities to the data and using Bayes’ rule.
#The model fits a Gaussian density to each class, assuming that all classes share the same covariance matrix.
#The fitted model can also be used to reduce the dimensionality of the input by projecting it to the most discriminative directions.
# PDF(x|k) = exp(−d/2) / ((2π)^(p/2) · sqrt(|S|)); see https://stats.stackexchange.com/questions/31366/linear-discriminant-analysis-and-bayes-rule-classification/31384#31384 The relationship of LDA and regression is here: https://stats.stackexchange.com/questions/31459/what-is-the-relationship-between-regression-and-linear-discriminant-analysis-ld
LinearDiscriminantAnalysis(),
# Quadratic Discriminant Analysis
# QuadraticDiscriminantAnalysis(),
#Bernoulli: The binomial model is useful if your feature vectors are binary. One application would be text classification with ‘bag of words’ model where the 1s & 0s are “word occurs in the document” and “word does not occur in the document” respectively.
#Naive Bayes is a simple technique for constructing classifiers: models that assign class labels to problem instances, represented as vectors of feature values, where the class labels are drawn from some finite set. #There is not a single algorithm for training such classifiers, but a family of algorithms based on a common principle: all naive Bayes classifiers assume that the value of a particular feature is independent of the value of any other feature, given the class variable.
#Pros:It is easy and fast to predict class of test data set. It also perform well in multi class predictionWhen assumption of independence holds, a Naive Bayes classifier performs better compare to other models like logistic regression and you need less training data.It perform well in case of categorical input variables compared to numerical variable(s). For numerical variable, normal distribution is assumed (bell curve, which is a strong assumption).
#Cons:If categorical variable has a category (in test data set), which was not observed in training data set, then model will assign a 0 (zero) probability and will be unable to make a prediction. This is often known as “Zero Frequency”. To solve this, we can use the smoothing technique. One of the simplest smoothing techniques is called Laplace estimation.On the other side naive Bayes is also known as a bad estimator, so the probability outputs from predict_proba are not to be taken too seriously.Another limitation of Naive Bayes is the assumption of independent predictors. In real life, it is almost impossible that we get a set of predictors which are completely independent.
BernoulliNB(alpha=1.0,fit_prior = True),
#Logictic Regression uses Log_Loss,but SVM Hinge Loss; Moreover Log_loss is not suitble for Naive Bayes
#Logistic Regression and SVM can both be High dimensional features with kernels
#Empirical risk minimization Gradient descent method Inexact line search Stochastic gradient descent methods
#The performance of SGDClassifier is not so good. Moreover, since this SGDClassifier normally uses Hinge loss, so I cannot calculate the ROC, which I use in the evaluation for all the classifiers, unless I set the loss to 'log'. so I commented this classifier.
#SGDClassifier(loss = 'log',penalty = 'elasticnet'),
####Linear classification Bayes####: Cost sensitive: Called multi-class “logistic regression” even though it is a classification model, linear, belongs to Bayes methods
#For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss; ‘liblinear’ is limited to one-versus-rest schemes.
#‘liblinear’ and ‘saga’ handle L1 penalty: square loss function is both convex and smooth and matches the 0–1 indicator function when {yf({\vec {x}})=0} yf({\vec {x}})=0 and when {yf({\vec {x}})=1} yf({\vec {x}})=1. However, the square loss function tends to penalize outliers excessively, leading to slower convergence rates (with regards to sample complexity) than for the logistic loss or hinge loss(L2) functions.https://www.csie.ntu.edu.tw/~cjlin/liblinear/
# Softmax vs. sigmoid: logistic regression generalizes the sigmoid function to the softmax function P(y|x,θ) = exp(x^T θ_y) / Σ_{y'} exp(x^T θ_{y'}) over the parameters θ.
# For binary classification, 𝑦 ∈ {−1,+1} #Decision boundary is a hyperplane in input space.
# P(y = ±1 | x, θ) = σ(y x^T θ) = 1 / (1 + exp(−y x^T θ))
LogisticRegression(solver='liblinear', max_iter=1000,
random_state=42,verbose=2,class_weight='balanced'), # class_weight='balanced' # penalize
#LogisticRegression(solver='saga', max_iter=1000, random_state=42,verbose=2),
# Gruppe 5: non-linear classification SVM using RBF kernel Trick
# Defi: Given a set of training examples, each marked as belonging to one or the other of two categories, an SVM training algorithm builds a model that assigns new examples to one category or the other, making it a non-probabilistic binary linear classifier (although methods such as Platt scaling exist to use SVM in a probabilistic classification setting).
# Defi: An SVM model is a representation of the examples as points in space, mapped so that the examples of the separate categories are divided by a clear gap that is as wide as possible.
# SVM: SVM classifier with Gaussian kernel: RBF, Dual classifier, uses kernal trick:Gaussian kernel: RBF, uses squared Euclidean distance
# class_weight='balanced' # penalize
# Kernel functions can be understood as a measure of similarity between instances.
# Primal view on data: “what does 𝐱 look like?”
# Dual view on data: “how similar is 𝐱 to each training instance?”
# Primal view: 𝑓𝛉𝐱 = 𝛉T𝜙𝐱 Model 𝛉 has as many parameters as the dimensionality of 𝜙 𝐱 . Good if there are many examples with few attributes.
# Dual view: 𝑓𝛂𝐱 = 𝛂T𝚽𝜙𝐱 Model 𝛂 has as many parameters as there are examples. Good if there are few examples with many attributes. The representation 𝜙 𝐱 can even be infinite dimensional, as long as the inner product can be computed efficiently.
# Kernel Ridge Regression # Squared loss: ℓ𝟐 𝑓𝛉 𝐱𝑖 ,𝑦𝑖 = 𝑓𝛉𝐱𝑖 −𝑦𝑖𝟐 L2 regularization: Ω2 𝛉 = ||𝛉||22 ## Minimize 𝑳 𝛉 = 𝛉T𝜙 𝐱 −𝑦𝑖 2 +𝜆𝛉T𝛉
# Optimization criterion of the dual SVM: max 𝛃𝛽𝑖 − 𝑛𝑖=11 2 𝛽𝑖𝛽𝑗𝑦𝑖𝑦𝑗𝑘 𝐱𝑖,𝐱𝑗 𝑛 𝑖,𝑗=1 Optimization over parameters 𝛃. Solution found with QP-Solver in 𝑂 𝑛2 . Sparse solution. Samples only appear as pairwise inner products.
# Primal SVM: Solution is a Vector 𝛉 in the space of the attributes. Dual SVM: The same solution is represented as weights 𝛽𝑖 of the samples.
# Kernel matrices are symmetric: 𝐊 = 𝐊T Kernel matrices 𝐊 ∈ ℝ𝑛×𝑛 are positive semidefinite: ∃𝚽 ∈ ℝ𝑛×𝑚:𝐊 = 𝚽𝚽T Kernel function 𝑘 𝐱,𝐱′ is positive semidefinite if 𝐊 is positive semidefinite for every data set. For every positive definite function 𝑘 there is at least one mapping 𝜙 𝐱 such that 𝑘 𝐱,𝐱′ = 𝜙 𝐱 T𝜙 𝐱′ for all 𝐱 and 𝐱′.
# Polynomial kernels: 𝑘𝑝𝑜𝑙𝑦 𝐱𝑖,𝐱𝑗 = 𝐱𝑖 T𝐱𝑗 +1 𝑝 Radial basis functions: 𝑘𝑅𝐵𝐹 𝐱𝑖,𝐱𝑗 = 𝑒−𝛾 𝐱𝑖−𝐱𝑗 2 Sigmoid kernels, Dynamic time-warping kernels, String kernels, Graph kernels,
# Kernel function 𝑘 𝐱,𝐱′ = 𝜙 𝐱 T𝜙 𝐱′ computes the inner product of the feature mapping of instances. The kernel function can often be computed without an explicit representation 𝜙 𝐱 . E.g., polynomial kernel: 𝑘𝑝𝑜𝑙𝑦 𝐱𝑖,𝐱𝑗 = 𝐱𝑖 T𝐱𝑗 +1 𝑝 Infinite-dimensional feature mappings are possible Eg., RBF kernel: 𝑘𝑅𝐵𝐹 𝐱𝑖,𝐱𝑗 = 𝑒−𝛾 𝐱𝑖−𝐱𝑗 2 Kernel functions for time series, strings, graphs, … For a given kernel matrix, the Mercer map provides a feature mapping. Useful if a learning problem is given as a kernel function but learning should take place in the primal. For example if the kernel matrix will be too large (quadratic memory consumption!
# Representer Theorem: 𝑓𝛉∗ 𝐱 = 𝛼𝑖 ∗𝜙 𝐱𝑖 T𝜙 𝐱𝑛 𝑖=1 Instances only interact through inner products Great for few instances, many attributes Kernel learning algorithms: Kernel ridge regression Kernel perceptron, SVM
# Kernel: k_RBF(x_i, x_j) = exp(−γ ‖x_i − x_j‖²). There is no finite-dimensional feature mapping φ for it.
# Empirical risk minimization for a classification problem with a 0-1 loss function is known to be an NP-hard problem even for such a relatively simple class of functions as linear classifiers.[2] Though, it can be solved efficiently when the minimal empirical risk is zero, i.e. data is linearly separable.
# Empirical risk minimization Gradient descent method Inexact line search Stochastic gradient descent methods
# In practice, machine learning algorithms cope with that either by employing a convex approximation to the 0-1 loss function (like hinge loss for SVM), which is easier to optimize, or by imposing assumptions on the distribution P(x,y) (and thus stop being agnostic learning algorithms to which the above result applies).
# Support vector machines Gradient or stochastic gradient, hinge loss, L2regularizer. Maximizes margin between instances and plane.
SVC(C=10, class_weight='balanced', gamma='auto', kernel='rbf',
max_iter=-1, probability=True, random_state=42, verbose=True)] # Linear: Etra Trees: This class implements a meta estimator that fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and use averaging to improve the predictive accuracy and control over-fitting.
#from costcla.sampling import cost_sampling
#from costcla.metrics import savings_score
#from costcla import models
#data = TraindropM.ix[:, 1:-5]
#sets = train_test_split(X_TraindropM, y, cost_mat =[[],[],[],[]], test_size=0.3,random_state=42)
#X_trainC, X_testC, y_trainC, y_testC, cost_mat_trainC, cost_mat_testC = sets
#y_pred_test_lr = LogisticRegression(random_state=0).fit(X_trainC, y_trainC).predict(X_testC)
#f = CostSensitiveLogisticRegression()
#f.fit(X_trainC, y_trainC, cost_mat_trainC)
#y_pred_test_cslr = f.predict(X_testC)
# define the confusion matrix
import csv
from sklearn.metrics import brier_score_loss
#lldropM = log_loss(y_TestdropM, test_predictionsdropM
#def logloss(true_label, lldropM):
#if true_label == 1:
# return -log(lldropM)
# else:
# return -log(1 - lldropM)
def draw_confusion_matricesdropM(confusion_matriciesdropM,class_namesdropM):
class_namesdropM = class_namesdropM.tolist()
for cm in confusion_matricesdropM:
classifier, cm = cm[0], cm[1]
print'Confusion matrixdropM:\n', cm
fig = plt.figure()
ax = fig.add_subplot(111)
sns.heatmap(cm, annot=True, ax = ax,cmap='Blues', fmt='g'); #annot=True to annotate cells
plt.ylabel('True')
plt.xlabel('Predicted')
ax.xaxis.set_ticklabels(['Not Caravan', 'Caraven'],horizontalalignment="center"); ax.yaxis.set_ticklabels(['Not Caravan', 'Caraven'],rotation=45);
plt.show()
# Class labels actually present in the test split (used later for the
# confusion-matrix tick labels).
class_namesdropM = np.unique(np.array(y_TestdropM))
# Logging for Visual Comparison
# Table of "unhelpful" scores (F-score and accuracy) per classifier.
log_colsUdropM = ["Classifier", "F-score","Accuracy"]
logUdropM = pd.DataFrame(columns=log_colsUdropM)
# Main score table. Column abbreviations (matching the append order further
# down): BLoss = Brier loss, BI = BenefitItem, BO = BenefitCo, CI = CostItem,
# CO = CostCo, IR = ImproveRatio, BTOR = balance trade-off ratio, PR = Profit ratio.
log_colsdropM=["Classifier", "Log Loss","OverfittingRoc","BLoss","BI","BO", "CI", "CO", "IR","BTOR","PR"]
logdropM = pd.DataFrame(columns=log_colsdropM)
# Cost-benefit/scenario table: MTAB/MTAP = minimum target audience to reach
# (balance / profit goal), MBPTAB/MBPTAP = maximum budget per target audience.
log_colsCBAdropM =["Classifier", "MTAB","MBPTAB","MTAP","MBPTAP"]
logCBAdropM = pd.DataFrame(columns=log_colsCBAdropM)
# Fit every classifier, report scores, run the cost-benefit scenario analysis,
# log the results, plot ROC/confusion matrices and dump predictions to CSV.
# NOTE(review): the loop body's indentation was stripped by the notebook
# export; the code lines below are kept byte-identical to the source.
for clf in classifiers:
clf.fit(X_TraindropM, y_TraindropM)
namedropM = clf.__class__.__name__
print("="*110)
print(namedropM)
print(str(clf));print('\n')
print('****************ResultsdropM****************')
print('\n----------------Unhelpful Scores\n')
# The F1 score can be interpreted as a weighted average of the precision and recall, where an F1 score reaches its best value at 1 and worst score at 0. The relative contribution of precision and recall to the F1 score are equal.
# F1 = 2 * (precision * recall) / (precision + recall)#precision: tp / (tp + fp) # Recall: tp / (tp + fn)
# We do not care about fp: not caravan predicted to be caraven, but f1 uses precision and precision uses fp, so this rate is not so suitble in this case
test_predictionsdropM = clf.predict(X_TestdropM)
accdropM = metrics.f1_score(y_TestdropM, test_predictionsdropM)
print("F-scoredropM: {:.2%}".format(accdropM))
# In multilabel classification, this function computes subset accuracy: the set of labels predicted for a sample must exactly match the corresponding set of labels in y_true.
# In multilabel classification, the function returns the subset accuracy. If the entire set of predicted labels for a sample strictly match with the true set of labels, then the subset accuracy is 1.0; otherwise it is 0.0.
# We do not care so much, if we can predict "0" right or not, and the data is very imbalanced, since it has lots "0"
test_predictionsdropM = clf.predict(X_TestdropM)
acc2dropM = accuracy_score(y_TestdropM, test_predictionsdropM)
print('Model accuracydropM: {:.2%} '.format(acc2dropM))
# Example of Receiver Operating Characteristic (ROC) metric to evaluate classifier output quality.
# ROC curves typically feature true positive rate on the Y axis, and false positive rate on the X axis. This means that the top left corner of the plot is the “ideal” point - a false positive rate of zero, and a true positive rate of one. This is not very realistic, but it does mean that a larger area under the curve (AUC) is usually better.
# The “steepness” of ROC curves is also important, since it is ideal to maximize the true positive rate while minimizing the false positive rate.
print('ROC just use this to check overfitting:\n ')
#If both curves are not too far from each other indicates (Train above test) that there is little overfitting, if the roc of the train is much better that the test and both curves are far from each other, then it´s overfiting. Otherwise underfitting
# AUC on hard 0/1 predictions (not probabilities) for both splits; their
# difference is logged as an overfitting indicator.
test_predictionsdropM = clf.predict(X_TestdropM)
RocScore=roc_auc_score(y_TestdropM, test_predictionsdropM)
fprB, tprB, thresholdsB = roc_curve(y_TestdropM, clf.predict_proba(X_TestdropM)[:,1])
RocScoreTrain=roc_auc_score(y_TraindropM, clf.predict(X_TraindropM))
fprBTrain, tprBTrain, thresholdsBTrain = roc_curve(y_TraindropM, clf.predict_proba(X_TraindropM)[:,1])
OverfittingRoc=float(format(RocScore-RocScoreTrain,'.2f'))
print RocScore
print RocScoreTrain
plt.figure()
# NOTE(review): 'classifiersTest' % RocScore has no '%' placeholder, so this
# raises TypeError when executed; probably meant something like
# 'classifiersTest (AUC = %0.2f)' % RocScore. Same on the next line. Confirm.
plt.plot(fprB, tprB, label='classifiersTest' % RocScore)
plt.plot(fprBTrain, tprBTrain, label='classifiersTrain' % RocScoreTrain)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
print('\n----------------Useful Scores: loss and cost-benefit scores\n')
# Trees and Logictic Regression uses Log_Loss,but SVM Hinge Loss; Moreover Log_loss is not suitble for Naive Bayes
# Log loss, aka logistic loss or cross-entropy loss.
# used in (multinomial) logistic regression and extensions of it such as neural networks, defined as the negative log-likelihood of the true labels given a probabilistic classifier’s predictions. The log loss is only defined for two or more labels. For a single sample with true label yt in {0,1} and estimated probability yp that yt = 1, the log loss is
#-log P(yt|yp) = -(yt log(yp) + (1 - yt) log(1 - yp))
# Log loss penalizes both types of errors, but especially those predications that are confident and wrong!
# This is not good, since the wrongly predicted caravan (FP) at the time point of the extraction of the data may become TP!
test_predictionsdropM = clf.predict_proba(X_TestdropM)
lldropM = log_loss(y_TestdropM, test_predictionsdropM)
print("Log LossdropM: {:.2f}".format(lldropM))
test_predictionsdropM = clf.predict(X_TestdropM)
# This function returns a score of the mean square difference between the actual outcome and the predicted probability of the possible outcome. The actual outcome has to be 1 or 0 (true or false), while the predicted probability of the actual outcome can be a value between 0 and 1.The brier score loss is also between 0 to 1 and the lower the score (the mean square difference is smaller), the more accurate the prediction is. It can be thought of as a measure of the “calibration” of a set of probabilistic predictions.
# Loss Functions for Classification: Zero-one loss, Logistic loss,Perceptron loss, Hinge Loss: not all the models use log_loss
BLoss= brier_score_loss(y_TestdropM, test_predictionsdropM)
print("Brier score loss: {:.2f}".format(BLoss))
# Cost-benefit bookkeeping from the confusion matrix. sklearn convention for
# labels [0, 1]: rows = true class, so [0,0]=TN, [1,1]=TP, [1,0]=FN, [0,1]=FP.
confusiondropM = metrics.confusion_matrix(y_TestdropM, test_predictionsdropM)
TNdropM = confusiondropM[0, 0]
TPdropM = confusiondropM[1, 1]
FNdropM = confusiondropM[1, 0]
FPdropM = confusiondropM[0, 1]
BenefitItemdropM = TPdropM
# NOTE(review): under Python 2 this is integer division unless the
# `from __future__ import division` cell further down ran first — confirm cell order.
BenefitCodropM = TPdropM / (TPdropM + FNdropM) # TP/(TP+FN) is recall/sensitivity (TPR), not specificity
print("BenefitItemdropM: {}".format(BenefitItemdropM))
print("BenefitCodropM: {:.2%}".format(BenefitCodropM))
CostItemdropM = (TPdropM + FPdropM)
CostCodropM = (TPdropM + FPdropM) / (TPdropM + TNdropM + FPdropM +FNdropM)
print("CostItemdropM: {}".format(CostItemdropM))
print("CostCodropM: {:.2%}".format(CostCodropM))
# Guard against division by zero: CostCo == 0 implies CostItem == 0.
if CostCodropM == 0:
ImproveRatiodropM = 0
else:
ImproveRatiodropM = (BenefitItemdropM/CostItemdropM) /((TPdropM+FNdropM) /(TPdropM + TNdropM + FPdropM+FNdropM))
# NOTE(review): CBLUE/CEND are defined in a later cell of this notebook; that
# cell must be executed before this loop or this line raises NameError.
print(CBLUE+"ImproveRatiodropM: {:.2%}".format(ImproveRatiodropM)+CEND)
#scenario BenefitItem*price-CostItem*
if BenefitItemdropM == 0:
balancetradeoffradiodropM = 0
else:
balancetradeoffradiodropM = float(format(CostItemdropM/BenefitItemdropM , '.2f'))
if CostCodropM == 0:
ProfitratiodropM=0
else:
ProfitratiodropM = float(format(BenefitItemdropM/CostItemdropM, '.2f'))
print(CBLUE+"balancetradeoffradio: {0:.2f}".format(balancetradeoffradiodropM)+CEND)
print(CBLUE+"ProfitratiodropM: {0:.2f}".format(ProfitratiodropM)+CEND)
print('\n----------------Scenario analysis shreshold: marketing and controlling strategies\n')
# #Scenario: give a budget and the revenue, then I can predict, how much you can spend on each costItem or how many costItem can be spent
Budget = 200000
#One scenario considers benefitgoal, one scenario considers the balance, no deficit
ProfitGoal= 20000
# Proft maybe 700 every year, the profit of every Customer is the Reveneue from each customer minus the cost of the customer management
ProfitPerBenefitItem = 700
print ("Business Application of the Model: \n\nLet´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. \nIn summary, the given KPIs of the insurance company are: " +CBEIGE+ "\n\nBudget = 200000 \nRevenuePerBenefitItem = 700 \nProfitGoal = 20000\n"+CEND+'\nThen to keep the account balance not to be deficit, then the insurance company should be:\n\n')
# within this budget, the smallest amount of target audience should be reached, in order to keep the balance of the account
# Min CostItem
if balancetradeoffradiodropM == 0:
MinTargetAudienceBdropM = 0
else:
MinTargetAudienceBdropM= int((Budget/ProfitPerBenefitItem)*balancetradeoffradiodropM)
#For every Target Audience, how much the company can spend for max. so that the company can keep the balance and avoid deficit
if MinTargetAudienceBdropM == 0:
MaxBudgetPerTargetAudienceBdropM = 0
else:
MaxBudgetPerTargetAudienceBdropM = float(format(Budget/MinTargetAudienceBdropM, '.2f'))
#To reach the benifit goal, at least MinTargetAudienceP should be reached
if balancetradeoffradiodropM == 0:
MinTargetAudiencePdropM = 0
else:
MinTargetAudiencePdropM = int(((Budget + ProfitGoal)/ProfitPerBenefitItem)*balancetradeoffradiodropM)
#To reach the benifit goal, at most MaxBudgetPerTargetAudienceP can be spent
# NOTE(review): the guard tests MinTargetAudienceBdropM but the division below
# uses MinTargetAudiencePdropM — if only P is zero this raises ZeroDivisionError;
# the guard should probably test MinTargetAudiencePdropM. Confirm.
if MinTargetAudienceBdropM == 0:
MaxBudgetPerTargetAudiencePdropM = 0
else:
MaxBudgetPerTargetAudiencePdropM = float(format(Budget/MinTargetAudiencePdropM, '.2f'))
print("MinTargetAudienceBdropM: {}".format(MinTargetAudienceBdropM))
print("MaxBudgetPerTargetAudienceBdropM: {0:.2f}".format(MaxBudgetPerTargetAudienceBdropM))
print('\nIn order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:')
print("MinTargetAudiencePdropM: {}".format(MinTargetAudiencePdropM))
print("MaxBudgetPerTargetAudiencePdropM: {0:.2f}".format(MaxBudgetPerTargetAudiencePdropM)+"\n")
# Append this classifier's rows to the three log tables.
# NOTE(review): DataFrame.append is deprecated in modern pandas (use pd.concat);
# fine for the 2018-era pandas this notebook targets.
log_entryUdropM = pd.DataFrame([[namedropM, accdropM,acc2dropM]], columns=log_colsUdropM)#FPR*100,
logUdropM = logUdropM.append(log_entryUdropM,ignore_index=True)
log_entrydropM = pd.DataFrame([[namedropM,lldropM,OverfittingRoc,BLoss,BenefitItemdropM,BenefitCodropM, CostItemdropM, CostCodropM,ImproveRatiodropM,balancetradeoffradiodropM,ProfitratiodropM]], columns=log_colsdropM)#FPR*100,
logdropM = logdropM.append(log_entrydropM,ignore_index=True)
log_entryCBAdropM = pd.DataFrame([[namedropM,MinTargetAudienceBdropM,MaxBudgetPerTargetAudienceBdropM,MinTargetAudiencePdropM,MaxBudgetPerTargetAudiencePdropM]], columns=log_colsdropM if False else log_colsCBAdropM)
logCBAdropM = logCBAdropM.append(log_entryCBAdropM,ignore_index=True)
reportdropM = classification_report(y_TestdropM, test_predictionsdropM)
print(reportdropM)
confusion_matricesdropM = [
( "", confusion_matrix(y_TestdropM, test_predictionsdropM))
]
draw_confusion_matricesdropM(confusion_matricesdropM,class_namesdropM)
# Predict on the hold-out/output data and append the row to the results CSV.
predictions = clf.predict(OutputdropM)
print predictions
#new_column = df['Classifiers']
# pd.DataFrame(predictions).to_csv('C:\Users\chenp\Desktop\output.csv', index = False,header = False, sep=',', mode='a',encoding ='utf-8')
# NOTE(review): non-raw Windows path — fine under Python 2, but '\U' in the
# string is a SyntaxError under Python 3; use a raw string if porting.
with open('C:\Users\chenp\Desktop\output.4.3.1.1.csv', 'a') as csvfile:#, newline=''
fwriter = csv.writer(csvfile, delimiter=',',quotechar='/',quoting=csv.QUOTE_MINIMAL)#,
fwriter.writerow(predictions)
#numpy.savetxt('C:/localpath/test.csv',prediction, ,delimiter=',')
#pd.read_csv(r'C:\Users\chenp\Desktop\00_CS_Master_Kurse_SS2018\ML1_IDA\P6_V\caravan.output.csv', sep='\t',
def draw_confusion_matricesselecdropM(confusion_matriciesselecdropM,class_namesselecdropM):
class_namesselecdropM = class_namesselecdropM.tolist()
for cm in confusion_matricesselecdropM:
classifier, cm = cm[0], cm[1]
print'Confusion matrixselecdropM:\n', cm
fig = plt.figure()
ax = fig.add_subplot(111)
sns.heatmap(cm, annot=True, ax = ax,cmap='Blues', fmt='g'); #annot=True to annotate cells
plt.ylabel('True')
plt.xlabel('Predicted')
ax.xaxis.set_ticklabels(['Not Caravan', 'Caraven'],horizontalalignment="center"); ax.yaxis.set_ticklabels(['Not Caravan', 'Caraven'],rotation=45);
plt.show()
# Class labels present in the feature-selected test split (for confusion-matrix
# tick labels).
class_namesselecdropM = np.unique(np.array(y_TestselecdropM))
# Logging for Visual Comparison
# "Unhelpful" scores table (F-score, accuracy) for the feature-selected run.
log_colsselecUdropM = ["Classifier", "F-score","Accuracy"]
logselecUdropM = pd.DataFrame(columns=log_colsselecUdropM)
# Main score table; abbreviations mirror the first run: BI/BO = BenefitItem/Co,
# CI/CO = CostItem/Co, IR = ImproveRatio, BTOR = balance trade-off ratio, PR = Profit ratio.
log_colsselecdropM=["Classifier", "Log Loss","OverfittingRoc", "BLoss","BI","BO", "CI", "CO", "IR","BTOR","PR"]
logselecdropM = pd.DataFrame(columns=log_colsselecdropM)
# Cost-benefit table: MTAB/MTAP = min target audience (balance/profit goal),
# MBPTAB/MBPTAP = max budget per target audience (balance/profit goal).
log_colsCBAselecdropM =["Classifier", "MTAB","MBPTAB","MTAP","MBPTAP"]
logCBAselecdropM = pd.DataFrame(columns=log_colsCBAselecdropM)
# Same evaluation pipeline as the first loop, re-run on the feature-selected
# (selec) data: fit, score, cost-benefit scenario, logging, plots, CSV dump.
# NOTE(review): loop-body indentation was stripped by the notebook export;
# code lines below are kept byte-identical to the source.
for clf in classifiers:
clf.fit(X_TrainselecdropM, y_TrainselecdropM)
nameselecdropM = clf.__class__.__name__
print("="*110)
print(nameselecdropM)
print('****ResultsselecdropM****')
print('\n----------------Unhelpful Scores\n')
test_predictionsselecdropM = clf.predict(X_TestselecdropM)
accselecdropM = f1_score(y_TestselecdropM, test_predictionsselecdropM)
print("F-scoreselecdropM: {:.2%}".format(accselecdropM))
test_predictionsselecdropM = clf.predict(X_TestselecdropM)
acc2selecdropM = accuracy_score(y_TestselecdropM, test_predictionsselecdropM)
print('Model accuracyselecdropM: {:.2%} '.format(acc2selecdropM))
print('\n----------------Useful Scores: loss and cost-benefit scores\n')
print('ROC just used here this to check overfitting(since if focus on positive values(0 here, not caravan), but we cares more about negative values): \nIf both curves are not too far from each other indicates (Train above test) that there is little overfitting, if the roc of the train is much better that the test and both curves are far from each other, then it´s overfiting. Otherwise underfitting.')
# Test/train AUC on hard predictions; the difference is the overfitting indicator.
RocScoreselec=roc_auc_score(y_TestselecdropM, clf.predict(X_TestselecdropM))
fprBselec, tprBselec, thresholdsBselec = roc_curve(y_TestselecdropM, clf.predict_proba(X_TestselecdropM)[:,1])
RocScoreTrainselec=roc_auc_score(y_TrainselecdropM, clf.predict(X_TrainselecdropM))
fprBTrainselec, tprBTrainselec, thresholdsBTrainselec = roc_curve(y_TrainselecdropM, clf.predict_proba(X_TrainselecdropM)[:,1])
OverfittingRocselec=float(format(RocScoreselec-RocScoreTrainselec,'.2f'))
print RocScoreselec
print RocScoreTrainselec
plt.figure()
# NOTE(review): 'classifiersTest' % RocScoreselec has no '%' placeholder, so
# this raises TypeError when executed; probably meant a format like
# 'classifiersTest (AUC = %0.2f)'. Same on the next line. Confirm.
plt.plot(fprBselec, tprBselec, label='classifiersTest' % RocScoreselec)
plt.plot(fprBTrainselec, tprBTrainselec, label='classifiersTrain' % RocScoreTrainselec)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
# NOTE(review): the two assignments below write test_predictionsdropM (missing
# "selec"), clobbering the first loop's variable. log_loss still reads the right
# value immediately afterwards, but the naming is likely a copy-paste slip.
test_predictionsdropM = clf.predict_proba(X_TestselecdropM)
llselecdropM = log_loss(y_TestselecdropM, test_predictionsdropM)
print("Log LossselecdropM: {:.2f}".format(llselecdropM))
test_predictionsdropM = clf.predict(X_TestselecdropM)
# This function returns a score of the mean square difference between the actual outcome and the predicted probability of the possible outcome. The actual outcome has to be 1 or 0 (true or false), while the predicted probability of the actual outcome can be a value between 0 and 1.The brier score loss is also between 0 to 1 and the lower the score (the mean square difference is smaller), the more accurate the prediction is. It can be thought of as a measure of the “calibration” of a set of probabilistic predictions.
BLossselec= brier_score_loss(y_TestselecdropM, test_predictionsselecdropM)
print("Brier score loss: {:.2f}".format(BLossselec))
test_predictionsselecdropM = clf.predict(X_TestselecdropM)
confusionselecdropM = metrics.confusion_matrix(y_TestselecdropM, test_predictionsselecdropM)
# NOTE(review): relative to sklearn's convention (rows = true class; for labels
# [0,1] cell [0,0] is TN and [1,1] is TP — as the first loop has it) these four
# names are swapped. The downstream arithmetic still mirrors the first loop
# because BenefitItem uses "TNselecdropM", which is the real TP cell — confirm
# before renaming anything.
TPselecdropM = confusionselecdropM[0, 0]
TNselecdropM = confusionselecdropM[1, 1]
FPselecdropM = confusionselecdropM[1, 0]
FNselecdropM = confusionselecdropM[0, 1]
BenefitItemselecdropM = TNselecdropM
BenefitCoselecdropM = TNselecdropM / (TNselecdropM + FPselecdropM) # with the swapped names this is real TP/(TP+FN): recall/sensitivity, not specificity
print("BenefitItemselecdropM: {}".format(BenefitItemselecdropM))
print("BenefitCoselecdropM: {:.2%}".format(BenefitCoselecdropM))
CostItemselecdropM = (TNselecdropM + FNselecdropM)
CostCoselecdropM = (TNselecdropM + FNselecdropM) / (TPselecdropM + TNselecdropM + FPselecdropM +FNselecdropM)
print("CostItemselecdropM: {}".format(CostItemselecdropM))
print("CostCoselecdropM: {:.2%}".format(CostCoselecdropM))
# Guard against division by zero: CostCo == 0 implies CostItem == 0.
if CostCoselecdropM == 0:
ImproveRatioselecdropM = 0
else:
ImproveRatioselecdropM = (BenefitItemselecdropM/CostItemselecdropM) /((TNselecdropM+FPselecdropM) /(TPselecdropM + TNselecdropM + FPselecdropM +FNselecdropM))
print(CBLUE+"ImproveRatioselecdropM: {:.2%}".format(ImproveRatioselecdropM)+CEND)
#scenario BenefitItem*price-CostItem*
if BenefitItemselecdropM == 0:
balancetradeoffradioselecdropM = 0
else:
balancetradeoffradioselecdropM = float(format(CostItemselecdropM/BenefitItemselecdropM , '.2f'))
if CostCoselecdropM == 0:
ProfitratioselecdropM=0
else:
ProfitratioselecdropM = float(format(BenefitItemselecdropM/CostItemselecdropM, '.2f'))
print(CBLUE+"balancetradeoffradioselecdropM: {0:.2f}".format(balancetradeoffradioselecdropM)+CEND)
print(CBLUE+"ProfitratioselecdropM: {0:.2f}".format(ProfitratioselecdropM)+CEND)
print('\n----------------Scenario analysis shreshold: marketing and controlling strategies\n')
# #Scenario: give a budget and the revenue, then I can predict, how much you can spend on each costItem or how many costItem can be spent
Budget = 200000
#One scenario considers benefitgoal, one scenario considers the balance, no deficit
ProfitGoal= 20000
# Proft maybe 700 every year, the profit of every Customer is the Reveneue from each customer minus the cost of the customer management
ProfitPerBenefitItem = 700
print ("Business Application of the Model: \n\nLet´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. \nIn summary, the given KPIs of the insurance company are: " +CBEIGE+ "\n\nBudget = 200000 \nRevenuePerBenefitItem = 700 \nProfitGoal = 20000\n"+CEND+CBLUE+'\nThen to keep the account balance not to be deficit, then the insurance company should be:\n\n'+CEND)
# within this budget, the smallest amount of target audience should be reached, in order to keep the balance of the account
# Min CostItem
if balancetradeoffradioselecdropM == 0:
MinTargetAudienceBselecdropM = 0
else:
MinTargetAudienceBselecdropM= int((Budget/ProfitPerBenefitItem)*balancetradeoffradioselecdropM)
#For every Target Audience, how much the company can spend for max. so that the company can keep the balance and avoid deficit
if MinTargetAudienceBselecdropM == 0:
MaxBudgetPerTargetAudienceBselecdropM = 0
else:
MaxBudgetPerTargetAudienceBselecdropM = float(format(Budget/MinTargetAudienceBselecdropM, '.2f'))
#To reach the benifit goal, at least MinTargetAudienceP should be reached
if balancetradeoffradioselecdropM == 0:
MinTargetAudiencePselecdropM = 0
else:
MinTargetAudiencePselecdropM = int(((Budget + ProfitGoal)/ProfitPerBenefitItem)*balancetradeoffradioselecdropM)
#To reach the benifit goal, at most MaxBudgetPerTargetAudienceP can be spent
# NOTE(review): the guard tests MinTargetAudienceBselecdropM but the division
# below uses MinTargetAudiencePselecdropM — if only P is zero this raises
# ZeroDivisionError; the guard should probably test P. Confirm.
if MinTargetAudienceBselecdropM == 0:
MaxBudgetPerTargetAudiencePselecdropM = 0
else:
MaxBudgetPerTargetAudiencePselecdropM = float(format(Budget/MinTargetAudiencePselecdropM, '.2f'))
print(CRED+"MinTargetAudienceBselecdropM: {}".format(MinTargetAudienceBselecdropM)+CEND)
print(CRED+"MaxBudgetPerTargetAudienceBselecdropM: {0:.2f}".format(MaxBudgetPerTargetAudienceBselecdropM)+CEND)
print('\nIn order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:')
print(CRED+"MinTargetAudiencePselecdropM: {}".format(MinTargetAudiencePselecdropM)+CEND)
print(CRED+"MaxBudgetPerTargetAudiencePselecdropM: {0:.2f}".format(MaxBudgetPerTargetAudiencePselecdropM)+CEND+"\n")
# Append this classifier's rows to the three "selec" log tables.
# NOTE(review): DataFrame.append is deprecated in modern pandas (use pd.concat).
log_entryselecUdropM = pd.DataFrame([[nameselecdropM, accselecdropM,acc2selecdropM]], columns=log_colsselecUdropM)#FPR*100,
logselecUdropM = logselecUdropM.append(log_entryselecUdropM,ignore_index=True)
log_entryselecdropM = pd.DataFrame([[nameselecdropM,llselecdropM,OverfittingRocselec, BLossselec,BenefitItemselecdropM,BenefitCoselecdropM, CostItemselecdropM, CostCoselecdropM,ImproveRatioselecdropM,balancetradeoffradioselecdropM,ProfitratioselecdropM]], columns=log_colsselecdropM)#FPR*100,
logselecdropM = logselecdropM.append(log_entryselecdropM,ignore_index=True)
log_entryCBAselecdropM = pd.DataFrame([[nameselecdropM,MinTargetAudienceBselecdropM,MaxBudgetPerTargetAudienceBselecdropM,MinTargetAudiencePselecdropM,MaxBudgetPerTargetAudiencePselecdropM]], columns=log_colsCBAselecdropM)
logCBAselecdropM = logCBAselecdropM.append(log_entryCBAselecdropM,ignore_index=True)
reportselecdropM = classification_report(y_TestselecdropM, test_predictionsselecdropM)
print(reportselecdropM)
confusion_matricesselecdropM = [
( "", confusion_matrix(y_TestselecdropM, test_predictionsselecdropM))
]
draw_confusion_matricesselecdropM(confusion_matricesselecdropM,class_namesselecdropM)
print("="*60)
print(str(clf));print('\n')
# Predict on the hold-out/output data and append the row to the results CSV.
predictions = clf.predict(OutputselecdropM)
print predictions
#new_column = df['Classifiers']
# pd.DataFrame(predictions).to_csv('C:\Users\chenp\Desktop\output.csv', index = False,header = False, sep=',', mode='a',encoding ='utf-8')
# NOTE(review): non-raw Windows path — fine under Python 2, but '\U' in the
# string is a SyntaxError under Python 3; use a raw string if porting.
with open('C:\Users\chenp\Desktop\output.selec.4.3.1.2.csv', 'a') as csvfile:#, newline=''
fwriter = csv.writer(csvfile, delimiter=',',quotechar='/',quoting=csv.QUOTE_MINIMAL)#,
fwriter.writerow(predictions)
from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score
from __future__ import division
def draw_confusion_matricesdropM2(confusion_matricesdropM2, class_namesdropM2):
    """Print and heatmap-plot each (label, confusion-matrix) pair.

    Fixes: the notebook export had lost this function's indentation
    (SyntaxError as a plain script), and the Py2-only ``print'…', cm``
    statement is replaced with a Py2/Py3-portable ``print(...)`` call.

    Parameters
    ----------
    confusion_matricesdropM2 : list of (str, ndarray)
        Pairs of classifier label and 2x2 confusion matrix.
    class_namesdropM2 : ndarray
        Class labels; converted to list but the tick labels below are
        hard-coded ('Not Caravan' / 'Caraven') -- kept for interface
        compatibility.
    """
    class_namesdropM2 = class_namesdropM2.tolist()
    for cm in confusion_matricesdropM2:
        classifier, cm = cm[0], cm[1]
        print('Confusion matrixdropM:\n{}'.format(cm))
        fig = plt.figure()
        ax = fig.add_subplot(111)
        sns.heatmap(cm, annot=True, ax=ax, cmap='Blues', fmt='g')  # annot=True annotates cells
        plt.ylabel('True')
        plt.xlabel('Predicted')
        ax.xaxis.set_ticklabels(['Not Caravan', 'Caraven'], horizontalalignment="center")
        ax.yaxis.set_ticklabels(['Not Caravan', 'Caraven'], rotation=45)
        plt.show()
# Distinct class labels found in the hold-out split (order as returned by
# np.unique, i.e. sorted); passed to the confusion-matrix plotting helper.
class_namesdropM2 = np.unique(np.array(y_TestdropM2))
# Empty accumulator tables for visual comparison of the classifiers:
# one for the "unhelpful" scores, one for loss/cost-benefit metrics,
# one for the cost-benefit-analysis (CBA) scenario figures.
log_colsUdropM2 = ["Classifier", "F-score", "Accuracy"]
log_colsdropM2 = ["Classifier", "Log Loss", "OverfittingRoc", "BLoss2",
                  "BI", "BO", "CI", "CO", "IR", "BTOR", "PR"]
log_colsCBAdropM2 = ["Classifier", "MTAB", "MBPTAB", "MTAP", "MBPTAP"]
logUdropM2 = pd.DataFrame(columns=log_colsUdropM2)
logdropM2 = pd.DataFrame(columns=log_colsdropM2)
logCBAdropM2 = pd.DataFrame(columns=log_colsCBAdropM2)
# ANSI terminal escape sequences used to colour console/notebook output.
# Foreground colours (SGR codes 30-37):
CBLACK = '\33[30m'
CRED = '\33[31m'
CGREEN = '\33[32m'
CYELLOW = '\33[33m'
CBLUE = '\33[34m'
CVIOLET = '\33[35m'
CBEIGE = '\33[36m'
CWHITE = '\33[37m'
# Background colours (SGR codes 40-47):
CBLACKBG = '\33[40m'
CREDBG = '\33[41m'
CGREENBG = '\33[42m'
CYELLOWBG = '\33[43m'
CBLUEBG = '\33[44m'
CVIOLETBG = '\33[45m'
CBEIGEBG = '\33[46m'
CWHITEBG = '\33[47m'
# Bright black ("grey", SGR 90):
CGREY = '\33[90m'
# Reset all attributes back to the terminal default:
CEND = '\033[0m'
# Evaluate every candidate classifier on the column-dropped train/test split
# ("dropM2" variant), log the metrics, and append the out-of-sample
# predictions to a CSV.  Indentation was reconstructed (the notebook export
# had flattened it) and several concrete bugs were fixed -- see inline notes.
for clf in classifiers:
    clf.fit(X_TraindropM2, y_TraindropM2)
    namedropM2 = clf.__class__.__name__
    print("=" * 110)
    print(namedropM2)
    print(str(clf)); print('\n')
    print('****************ResultsdropM****************')
    print('\n----------------Unhelpful Scores\n')
    test_predictionsdropM2 = clf.predict(X_TestdropM2)
    accdropM2 = f1_score(y_TestdropM2, test_predictionsdropM2)
    print("F-scoredropM: {:.2%}".format(accdropM2))
    test_predictionsdropM2 = clf.predict(X_TestdropM2)
    # BUG FIX: original scored against y_TestdropM (labels of a *different*
    # split); must use y_TestdropM2, which matches X_TestdropM2.
    acc2dropM2 = accuracy_score(y_TestdropM2, test_predictionsdropM2)
    print('Model accuracydropM: {:.2%} '.format(acc2dropM2))
    print('ROC just use this to check overfitting: \n')
    # If train and test ROC curves are close there is little overfitting; a
    # train curve far above the test curve indicates overfitting.
    RocScore2 = roc_auc_score(y_TestdropM2, test_predictionsdropM2)
    fprB2, tprB2, thresholdsB2 = roc_curve(y_TestdropM2, clf.predict_proba(X_TestdropM2)[:, 1])
    RocScoreTrain2 = roc_auc_score(y_TraindropM2, clf.predict(X_TraindropM2))
    fprBTrain2, tprBTrain2, thresholdsBTrain2 = roc_curve(y_TraindropM2, clf.predict_proba(X_TraindropM2)[:, 1])
    OverfittingRoc2 = float(format(RocScore2 - RocScoreTrain2, '.2f'))
    print(RocScore2)
    print(RocScoreTrain2)
    plt.figure()
    # BUG FIX: "'classifiersTest' % RocScore2" raised TypeError (the string
    # has no conversion specifier); embed the AUC in the legend label.
    plt.plot(fprB2, tprB2, label='classifiersTest (AUC = %0.2f)' % RocScore2)
    plt.plot(fprBTrain2, tprBTrain2, label='classifiersTrain (AUC = %0.2f)' % RocScoreTrain2)
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('Log_ROC')
    plt.show()
    print('\n----------------Useful Scores: loss and cost-benefit scores\n')
    test_predictionsdropM2 = clf.predict_proba(X_TestdropM2)
    # BUG FIX: log_loss also used the wrong label vector (y_TestdropM).
    lldropM2 = log_loss(y_TestdropM2, test_predictionsdropM2)
    print("Log LossdropM: {:.2f}".format(lldropM2))
    test_predictionsdropM2 = clf.predict(X_TestdropM2)
    # Brier score loss: mean squared difference between outcome (0/1) and the
    # prediction; lower is better ("calibration" of probabilistic predictions).
    BLoss2 = brier_score_loss(y_TestdropM2, test_predictionsdropM2)
    print("Brier score loss: {:.2f}".format(BLoss2))
    test_predictionsdropM2 = clf.predict(X_TestdropM2)
    confusiondropM2 = metrics.confusion_matrix(y_TestdropM2, test_predictionsdropM2)
    # NOTE(review): sklearn's layout is [true, predicted]; the TP/TN naming
    # below follows the notebook's own convention (class 0 treated as
    # "positive") -- confirm before reusing these names elsewhere.
    TPdropM2 = confusiondropM2[0, 0]
    TNdropM2 = confusiondropM2[1, 1]
    FPdropM2 = confusiondropM2[1, 0]
    FNdropM2 = confusiondropM2[0, 1]
    BenefitItemdropM2 = TNdropM2
    BenefitCodropM2 = TNdropM2 / (TNdropM2 + FPdropM2)  # specificity
    print("BenefitItemdropM: {}".format(BenefitItemdropM2))
    print("BenefitCodropM: {:.2%}".format(BenefitCodropM2))
    CostItemdropM2 = (TNdropM2 + FNdropM2)
    CostCodropM2 = (TNdropM2 + FNdropM2) / (TPdropM2 + TNdropM2 + FPdropM2 + FNdropM2)
    print("CostItemdropM: {}".format(CostItemdropM2))
    print("CostCodropM: {:.2%}".format(CostCodropM2))
    ImproveRatiodropM2 = (BenefitItemdropM2 / CostItemdropM2) / ((TNdropM2 + FPdropM2) / (TPdropM2 + TNdropM2 + FPdropM2 + FNdropM2))
    print("ImproveRatio: {:.2%}".format(ImproveRatiodropM2))
    # Scenario: BenefitItem * price - CostItem * campaign cost.
    balancetradeoffradiodropM2 = float(format(CostItemdropM2 / BenefitItemdropM2, '.2f'))
    print(CBLUE + "balancetradeoffradio: {0:.2f}".format(balancetradeoffradiodropM2) + CEND)
    # BUG FIX: profit ratio mixed in BenefitItemdropM from another split; use
    # this loop's BenefitItemdropM2.  A duplicated balancetradeoffradio print
    # was also removed.
    ProfitratiodropM2 = float(format(BenefitItemdropM2 / CostItemdropM2, '.2f'))
    print(CBLUE + "ProfitratiodropM: {0:.2f}".format(ProfitratiodropM2) + CEND)
    print('\n----------------Scenario analysis shreshold: marketing and controlling strategies\n')
    # Scenario: given a yearly budget and per-customer revenue, derive how
    # many target audiences must be reached and the max spend per audience.
    Budget = 200000
    ProfitGoal = 20000
    # Profit per caravan customer per year (revenue minus management cost).
    ProfitPerBenefitItem = 700
    print ("Business Application of the Model: \n\nLet´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. \nIn summary, the given KPIs of the insurance company are: " +CBEIGE+ "\n\nBudget = 200000 \nRevenuePerBenefitItem = 700 \nProfitGoal = 20000\n"+CEND+'\nThen to keep the account balance not to be deficit, then the insurance company should be:\n\n')
    # Smallest audience to reach within budget to keep the account balanced.
    MinTargetAudienceBdropM2 = int((Budget / ProfitPerBenefitItem) * balancetradeoffradiodropM2)
    # Max spend per target audience while avoiding a deficit.
    MaxBudgetPerTargetAudienceBdropM2 = float(format(Budget / MinTargetAudienceBdropM2, '.2f'))
    # To reach the profit goal, at least MinTargetAudienceP must be reached...
    MinTargetAudiencePdropM2 = int(((Budget + ProfitGoal) / ProfitPerBenefitItem) * balancetradeoffradiodropM2)
    # ...and at most MaxBudgetPerTargetAudienceP can be spent per audience.
    MaxBudgetPerTargetAudiencePdropM2 = float(format(Budget / MinTargetAudiencePdropM2, '.2f'))
    print("MinTargetAudienceBdropM: {}".format(MinTargetAudienceBdropM2))
    print("MaxBudgetPerTargetAudienceBdropM: {0:.2f}".format(MaxBudgetPerTargetAudienceBdropM2))
    print('\nIn order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:')
    print("MinTargetAudiencePdropM: {}".format(MinTargetAudiencePdropM2))
    print("MaxBudgetPerTargetAudiencePdropM: {0:.2f}".format(MaxBudgetPerTargetAudiencePdropM2) + "\n")
    # Accumulate this classifier's rows into the comparison tables.
    log_entryUdropM2 = pd.DataFrame([[namedropM2, accdropM2, acc2dropM2]], columns=log_colsUdropM2)
    logUdropM2 = logUdropM2.append(log_entryUdropM2, ignore_index=True)
    log_entrydropM2 = pd.DataFrame([[namedropM2, lldropM2, OverfittingRoc2, BLoss2, BenefitItemdropM2, BenefitCodropM2, CostItemdropM2, CostCodropM2, ImproveRatiodropM2, balancetradeoffradiodropM2, ProfitratiodropM2]], columns=log_colsdropM2)
    logdropM2 = logdropM2.append(log_entrydropM2, ignore_index=True)
    log_entryCBAdropM2 = pd.DataFrame([[namedropM2, MinTargetAudienceBdropM2, MaxBudgetPerTargetAudienceBdropM2, MinTargetAudiencePdropM2, MaxBudgetPerTargetAudiencePdropM2]], columns=log_colsCBAdropM2)
    logCBAdropM2 = logCBAdropM2.append(log_entryCBAdropM2, ignore_index=True)
    reportdropM2 = classification_report(y_TestdropM2, test_predictionsdropM2)
    print(reportdropM2)
    confusion_matricesdropM2 = [
        ("", confusion_matrix(y_TestdropM2, test_predictionsdropM2))
    ]
    draw_confusion_matricesdropM2(confusion_matricesdropM2, class_namesdropM2)
    predictions = clf.predict(OutputdropM2)
    print(predictions)
    # BUG FIX: the Windows path contained '\U' which is an invalid string
    # escape under Python 3; a raw string yields the same characters.
    with open(r'C:\Users\chenp\Desktop\output2.4.3.1.3.csv', 'a') as csvfile:
        fwriter = csv.writer(csvfile, delimiter=',', quotechar='/', quoting=csv.QUOTE_MINIMAL)
        fwriter.writerow(predictions)
from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score
from __future__ import division
from sklearn.model_selection import StratifiedKFold
#Confusion matrix and confusion tables:
#The columns represent the actual class and the rows represent the predicted class. Let's evaluate performance:
#(NOTE(review): sklearn's confusion_matrix actually uses rows = true class, columns = predicted class -- verify which convention is meant here.)
def draw_confusion_matricesdropMSS(confusion_matricesdropMSS, class_namesdropMSS):
    """Print and heatmap-plot each (label, confusion-matrix) pair.

    BUG FIX: the parameter was misspelled 'confusion_matriciesdropMSS' while
    the loop below iterated 'confusion_matricesdropMSS', so the function
    silently read the module-level global of that name and ignored its
    argument.  (All call sites pass positionally, so renaming the parameter
    is safe.)  Lost indentation and the Py2-only print statement were also
    repaired.
    """
    # NOTE(review): class_namesdropMSS is converted but never used below --
    # tick labels are hard-coded; parameter kept for interface compatibility.
    class_namesdropMSS = class_namesdropMSS.tolist()
    for cm in confusion_matricesdropMSS:
        classifier, cm = cm[0], cm[1]
        print('Confusion matrixdropM:\n{}'.format(cm))
        fig = plt.figure()
        ax = fig.add_subplot(111)
        sns.heatmap(cm, annot=True, ax=ax, cmap='Blues', fmt='g')  # annot=True annotates cells
        plt.ylabel('True')
        plt.xlabel('Predicted')
        ax.xaxis.set_ticklabels(['Not Caravan', 'Caraven'], horizontalalignment="center")
        ax.yaxis.set_ticklabels(['Not Caravan', 'Caraven'], rotation=45)
        plt.show()
# Empty accumulator tables for the stratified-KFold + SMOTE experiment:
# "unhelpful" scores, loss/cost-benefit metrics, and CBA scenario figures.
log_colsUdropMSS = ["Classifier", "F-score", "Accuracy"]
log_colsdropMSS = ["Classifier", "Log Loss", "OverfittingRoc", "BLossS",
                   "BI", "BO", "CI", "CO", "IR", "BTOR", "PR"]
log_colsCBAdropMSS = ["Classifier", "MTAB", "MBPTAB", "MTAP", "MBPTAP"]
logUdropMSS = pd.DataFrame(columns=log_colsUdropMSS)
logdropMSS = pd.DataFrame(columns=log_colsdropMSS)
logCBAdropMSS = pd.DataFrame(columns=log_colsCBAdropMSS)
# Stratified 3-fold CV with SMOTE oversampling, one pass per classifier.
# NOTE(review): indentation was lost in the notebook export; the statements
# below are the intended body of this loop (and of the inner skf.split loop).
for clf in classifiers:
skf = StratifiedKFold(n_splits=3,random_state=43,shuffle=True)
# NOTE(review): Python-2 print statement; would need print(skf) under Py3.
print skf
skf.get_n_splits(X, y)
# Inner loop: one train/test fold per split of the full (X, y) data.
for train_index, test_index in skf.split(X,y):
print("TRAIN:", train_index, "TEST:", test_index)
X_Train, X_Test = X.iloc[train_index], X.iloc[test_index]
y_Train, y_Test = y.iloc[train_index], y.iloc[test_index]
# Class labels present in this fold's test split (sorted by np.unique).
class_namesdropMSS = np.unique(np.array(y_Test))
# Drop the demographic (M*) and contribution (P*-partial) columns, keeping
# only the selected feature subset for this experiment.
X_Train=X_Train.drop(["C1MOSTYPE", "C2MAANTHUI", "C3MGEMOMV","C4MGEMLEEF", "C5MOSHOOFD",
"C6MGODRK","C7MGODPR", "C8MGODOV", "C9MGODGE","C10MRELGE", "C11MRELSA",
"C12MRELOV","C13MFALLEEN", "C14MFGEKIND", "C15MFWEKIND","C16MOPLHOOG", "C17MOPLMIDD",
"C18MOPLLAAG","C19MBERHOOG", "C20MBERZELF", "C21MBERBOER","C22MBERMIDD",
"C23MBERARBG", "C24MBERARBO","C25MSKA", "C26MSKB1", "C27MSKB2","C28MSKC",
"C29MSKD", "C30MHHUUR","C31MHKOOP", "C32MAUT1", "C33MAUT2","C34MAUT0", "C35MZFONDS",
"C36MZPART","C37MINKMthirty", "C38MINK3045", "C39MINK4575","C40MINK7512", "C41MINK123M",
"C42MINKGEM", "C65AWAPART",
"C66AWABEDR","C67AWALAND","C68APERSAUT", "C69ABESAUT", "C70AMOTSCO","C71AVRAAUT",
"C72AAANHANG", "C73ATRACTOR","C74AWERKT", "C75ABROM", "C76ALEVEN","C77APERSONG",
"C78AGEZONG", "C79AWAOREG","C80ABRAND","C81AZEILPL", "C82APLEZIER", "C83AFIETS",
"C84AINBOED", "C85ABYSTAND"], axis=1)
# Same column drop on the test split so train/test schemas match.
X_Test =X_Test.drop(["C1MOSTYPE", "C2MAANTHUI", "C3MGEMOMV","C4MGEMLEEF", "C5MOSHOOFD",
"C6MGODRK","C7MGODPR", "C8MGODOV", "C9MGODGE","C10MRELGE", "C11MRELSA",
"C12MRELOV","C13MFALLEEN", "C14MFGEKIND", "C15MFWEKIND","C16MOPLHOOG", "C17MOPLMIDD",
"C18MOPLLAAG","C19MBERHOOG", "C20MBERZELF", "C21MBERBOER","C22MBERMIDD",
"C23MBERARBG", "C24MBERARBO","C25MSKA", "C26MSKB1", "C27MSKB2","C28MSKC",
"C29MSKD", "C30MHHUUR","C31MHKOOP", "C32MAUT1", "C33MAUT2","C34MAUT0", "C35MZFONDS",
"C36MZPART","C37MINKMthirty", "C38MINK3045", "C39MINK4575","C40MINK7512", "C41MINK123M",
"C42MINKGEM", "C65AWAPART",
"C66AWABEDR","C67AWALAND","C68APERSAUT", "C69ABESAUT", "C70AMOTSCO","C71AVRAAUT",
"C72AAANHANG", "C73ATRACTOR","C74AWERKT", "C75ABROM", "C76ALEVEN","C77APERSONG",
"C78AGEZONG", "C79AWAOREG","C80ABRAND","C81AZEILPL", "C82APLEZIER", "C83AFIETS",
"C84AINBOED", "C85ABYSTAND"], axis=1)
# Apply regular SMOTE
# NOTE(review): SMOTE(kind=...) and fit_sample are the pre-0.4
# imbalanced-learn API; modern versions use SMOTE() and fit_resample.
sm = SMOTE(kind='regular')
X_trainSS, y_trainSS = sm.fit_sample(X_Train, y_Train)
print('Training Set Shape after oversampling: ', X_trainSS.shape, y_trainSS.shape)
print(pd.crosstab(y_trainSS,y_trainSS))
#print("TRAIN:", X_Train)
#cv = cross_validation.StratifiedKFold(y_TraindropM, n_folds=3, random_state=42)
#test_predictionsdropM = cross_validation.cross_val_predict(clf, X=X_TraindropM, y=y_TraindropM, n_jobs=-1, cv=cv)
# Fit on the SMOTE-balanced fold, evaluate on the untouched test fold.
clf.fit(X_trainSS, y_trainSS)
namedropMSS = clf.__class__.__name__
print("="*110)
print(namedropMSS)
print(str(clf));print('\n')
print('****************ResultsdropM****************')
print('\n----------------Unhelpful Scores\n')
test_predictionsSS = clf.predict(X_Test)
accdropMSS = f1_score(y_Test, clf.predict(X_Test))
print("F-scoredropM: {:.2%}".format(accdropMSS))
test_predictionsSS = clf.predict(X_Test)
acc2dropMSS = accuracy_score(y_Test, test_predictionsSS)
print('Model accuracySS: {:.2%} '.format(acc2dropMSS))
print('ROC just use this to check overfitting: \n')
#If both curves are not too far from each other indicates (Train above test) that there is little overfitting, if the roc of the train is much better that the test and both curves are far from each other, then it´s overfiting. Otherwise underfitting.
RocScoreSS=roc_auc_score(y_Test, test_predictionsSS)
fprBSS, tprBSS, thresholdsBSS = roc_curve(y_Test, clf.predict_proba(X_Test)[:,1])
# NOTE(review): train AUC is computed on the ORIGINAL (pre-SMOTE) X_Train,
# while the train ROC curve two lines below uses the resampled X_trainSS --
# the two "train" numbers are not directly comparable; confirm intent.
RocScoreTrainSS=roc_auc_score(y_Train, clf.predict(X_Train))
fprBTrainSS, tprBTrainSS, thresholdsBTrainSS = roc_curve(y_trainSS, clf.predict_proba(X_trainSS)[:,1])
# NOTE(review): here OverfittingRoc = train - test, but the analogous loop
# above computes test - train (OverfittingRoc2); the sign convention is
# inconsistent between experiments.
OverfittingRoc=float(format(RocScoreTrainSS-RocScoreSS,'.2f'))
print RocScoreSS
print RocScoreTrainSS
plt.figure()
# NOTE(review): 'classifiersTest' % RocScoreSS has no conversion specifier
# and raises TypeError at runtime -- the intended label format needs a
# placeholder, e.g. 'classifiersTest (AUC = %0.2f)'.
plt.plot(fprBSS, tprBSS, label='classifiersTest' % RocScoreSS)
plt.plot(fprBTrainSS, tprBTrainSS, label='classifiersTrain' % RocScoreTrainSS)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
print('\n----------------Useful Scores: loss and cost-benefit scores\n')
test_predictionsSS = clf.predict_proba(X_Test)
lldropMSS = log_loss(y_Test, test_predictionsSS)
print("Log LossdropMSS: {:.2f}".format(lldropMSS))
test_predictionsSS = clf.predict(X_Test)
# This function returns a score of the mean square difference between the actual outcome and the predicted probability of the possible outcome. The actual outcome has to be 1 or 0 (true or false), while the predicted probability of the actual outcome can be a value between 0 and 1.The brier score loss is also between 0 to 1 and the lower the score (the mean square difference is smaller), the more accurate the prediction is. It can be thought of as a measure of the “calibration” of a set of probabilistic predictions.
BLossS= brier_score_loss(y_Test, test_predictionsSS)
print("Brier score loss: {:.2f}".format(BLossS))
# test_predictionsdropM = clf.predict(X_TestdropM)
confusiondropMSS = metrics.confusion_matrix(y_Test, test_predictionsSS)
# NOTE(review): sklearn's confusion_matrix layout is [true, predicted];
# the TP/TN naming below treats class 0 as "positive" -- confirm.
TPdropMSS = confusiondropMSS[0, 0]
TNdropMSS = confusiondropMSS[1, 1]
FPdropMSS = confusiondropMSS[1, 0]
FNdropMSS = confusiondropMSS[0, 1]
BenefitItemdropMSS = TNdropMSS
BenefitCodropMSS = TNdropMSS / (TNdropMSS + FPdropMSS) # this is specificity in statistics
print("BenefitItemdropMSS: {}".format(BenefitItemdropMSS))
print("BenefitCodropMSS: {:.2%}".format(BenefitCodropMSS))
CostItemdropMSS = (TNdropMSS + FNdropMSS)
CostCodropMSS = (TNdropMSS + FNdropMSS) / (TPdropMSS + TNdropMSS + FPdropMSS +FNdropMSS)
print("CostItemdropMSS: {}".format(CostItemdropMSS))
print("CostCodropMSS: {:.2%}".format(CostCodropMSS))
ImproveRatiodropMSS = (BenefitItemdropMSS/CostItemdropMSS) /((TNdropMSS+FPdropMSS) /(TPdropMSS + TNdropMSS + FPdropMSS+FNdropMSS))
print("ImproveRatio: {:.2%}".format(ImproveRatiodropMSS))
#scenario BenefitItem*price-CostItem*
balancetradeoffradiodropMSS = float(format(CostItemdropMSS/BenefitItemdropMSS , '.2f'))
print(CBLUE+"balancetradeoffradio: {0:.2f}".format(balancetradeoffradiodropMSS)+CEND)
ProfitratiodropMSS = float(format(BenefitItemdropMSS/CostItemdropMSS, '.2f'))
# NOTE(review): the next line duplicates the balancetradeoffradio print
# above -- looks like a copy-paste leftover.
print(CBLUE+"balancetradeoffradio: {0:.2f}".format(balancetradeoffradiodropMSS)+CEND)
print(CBLUE+"ProfitratiodropM: {0:.2f}".format(ProfitratiodropMSS)+CEND)
print('\n----------------Scenario analysis shreshold: marketing and controlling strategies\n')
# #Scenario: give a budget and the revenue, then I can predict, how much you can spend on each costItem or how many costItem can be spent
Budget = 200000
#One scenario considers benefitgoal, one scenario considers the balance, no deficit
ProfitGoal= 20000
# Proft maybe 700 every year, the profit of every Customer is the Reveneue from each customer minus the cost of the customer management
ProfitPerBenefitItem = 700
print ("Business Application of the Model: \n\nLet´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. \nIn summary, the given KPIs of the insurance company are: " +CBEIGE+ "\n\nBudget = 200000 \nRevenuePerBenefitItem = 700 \nProfitGoal = 20000\n"+CEND+'\nThen to keep the account balance not to be deficit, then the insurance company should be:\n\n')
# within this budget, the smallest amount of target audience should be reached, in order to keep the balance of the account
# Min CostItem
MinTargetAudienceBdropMSS= int((Budget/ProfitPerBenefitItem)*balancetradeoffradiodropMSS)
#For every Target Audience, how much the company can spend for max. so that the company can keep the balance and avoid deficit
MaxBudgetPerTargetAudienceBdropMSS = float(format(Budget/MinTargetAudienceBdropMSS, '.2f'))
#To reach the benifit goal, at least MinTargetAudienceP should be reached
MinTargetAudiencePdropMSS = int(((Budget + ProfitGoal)/ProfitPerBenefitItem)*balancetradeoffradiodropMSS)
#To reach the benifit goal, at most MaxBudgetPerTargetAudienceP can be spent
MaxBudgetPerTargetAudiencePdropMSS = float(format(Budget/MinTargetAudiencePdropMSS, '.2f'))
print("MinTargetAudienceBdropMSS: {}".format(MinTargetAudienceBdropMSS))
print("MaxBudgetPerTargetAudienceBdropMSS: {0:.2f}".format(MaxBudgetPerTargetAudienceBdropMSS))
print('\nIn order to reach the profit goal, then the mininum target ordience number and the maximum budget for every target audience are:')
print("MinTargetAudiencePdropMSS: {}".format(MinTargetAudiencePdropMSS))
print("MaxBudgetPerTargetAudiencePdropMSS: {0:.2f}".format(MaxBudgetPerTargetAudiencePdropMSS)+"\n")
# Accumulate this classifier's rows into the comparison tables.
log_entryUdropMSS = pd.DataFrame([[namedropMSS, accdropMSS,acc2dropMSS]], columns=log_colsUdropMSS)#FPR*100,
logUdropMSS = logUdropMSS.append(log_entryUdropMSS,ignore_index=True)
log_entrydropMSS = pd.DataFrame([[namedropMSS,lldropMSS,OverfittingRoc,BLossS,BenefitItemdropMSS,BenefitCodropMSS, CostItemdropMSS, CostCodropMSS,ImproveRatiodropMSS,balancetradeoffradiodropMSS,ProfitratiodropMSS]], columns=log_colsdropMSS)#FPR*100,
logdropMSS = logdropMSS.append(log_entrydropMSS,ignore_index=True)
log_entryCBAdropMSS = pd.DataFrame([[namedropMSS,MinTargetAudienceBdropMSS,MaxBudgetPerTargetAudienceBdropMSS,MinTargetAudiencePdropMSS,MaxBudgetPerTargetAudiencePdropMSS]], columns=log_colsCBAdropMSS)
logCBAdropMSS = logCBAdropMSS.append(log_entryCBAdropMSS,ignore_index=True)
reportdropMSS = classification_report(y_Test, test_predictionsSS)
print(reportdropMSS)
confusion_matricesdropMSS = [
( "", confusion_matrix(y_Test, test_predictionsSS))
]
draw_confusion_matricesdropMSS(confusion_matricesdropMSS,class_namesdropMSS)
# NOTE(review): OutputdropM2 was prepared for the experiment above -- its
# columns may not match the feature subset dropped in THIS loop; TODO
# confirm a dropMSS-specific output frame was intended.
predictions = clf.predict(OutputdropM2)
print predictions
#new_column = df['Classifiers']
# pd.DataFrame(predictions).to_csv('C:\Users\chenp\Desktop\output.csv', index = False,header = False, sep=',', mode='a',encoding ='utf-8')
# NOTE(review): '\U' in this non-raw path string is an invalid escape under
# Python 3 (works only under Python 2); a raw string would be portable.
with open('C:\Users\chenp\Desktop\output.KFold.4.3.2.csv', 'a') as csvfile:#, newline=''
fwriter = csv.writer(csvfile, delimiter=',',quotechar='/',quoting=csv.QUOTE_MINIMAL)#,
fwriter.writerow(predictions)
#def color_negative_red(lldropM2):
# """
# Takes a scalar and returns a string with
# the css property `'color: red'` for negative
# strings, black otherwise.
# """
# color = 'red' if lldropM2 > 0.7 else 'black'
# return 'color: %s' % color
# One shared Styler rule: highlight the hovered table row in yellow.
_hover_yellow = [{'selector': 'tr:hover',
                  'props': [('background-color', 'yellow')]}]
logUdropM.style.set_table_styles(_hover_yellow)
logdropM.style.set_table_styles(_hover_yellow)
print ("Business Application of the Model: \n\nLet´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. \nIn summary, the given KPIs of the insurance company are: " +CBEIGE+ "\n\nBudget = 200000 \nRevenuePerBenefitItem = 700 \nProfitGoal = 20000\n"+CEND+'\nThen to keep the account balance not to be deficit (MTAB and MBPTAB) and to reach the proft goal (MTAP and MBPTAP), then the insurance company should be:\n\n')
logCBAdropM.style.set_table_styles(_hover_yellow)
logselecUdropM.style.set_table_styles(_hover_yellow)
logselecdropM.style.set_table_styles(_hover_yellow)
print ("Business Application of the Model: \n\nLet´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. \nIn summary, the given KPIs of the insurance company are: " +CBEIGE+ "\n\nBudget = 200000 \nRevenuePerBenefitItem = 700 \nProfitGoal = 20000\n"+CEND+'\nThen to keep the account balance not to be deficit (MTAB and MBPTAB) and to reach the proft goal (MTAP and MBPTAP), then the insurance company should be:\n\n')
logCBAselecdropM.style.set_table_styles(_hover_yellow)
logUdropM2.style.set_table_styles(_hover_yellow)
logdropM2.style.set_table_styles(_hover_yellow)
print ("Business Application of the Model: \n\nLet´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. \nIn summary, the given KPIs of the insurance company are: " +CBEIGE+ "\n\nBudget = 200000 \nRevenuePerBenefitItem = 700 \nProfitGoal = 20000\n"+CEND+'\nThen to keep the account balance not to be deficit (MTAB and MBPTAB) and to reach the proft goal (MTAP and MBPTAP), then the insurance company should be:\n\n')
# NOTE(review): logCBAdropM is styled a second time here (logCBAdropM2 may
# have been intended) -- kept to preserve the original behaviour.
logCBAdropM.style.set_table_styles(_hover_yellow)
print ('Model Evaluation ')
logUdropMSS.style.set_table_styles(_hover_yellow)
# KFold is less pessimistic than holdout (train_test_split), so the Brier score loss is larger than with train_test_split.
# Hover-highlight rule for the KFold experiment's comparison tables.
_hover_yellow_ss = [{'selector': 'tr:hover',
                     'props': [('background-color', 'yellow')]}]
logdropMSS.style.set_table_styles(_hover_yellow_ss)
print ("Business Application of the Model: \n\nLet´s say that the insurance company has the budget of 200000 EUR per year for Caravan mobile home insurance marketing, the revenue from every caravan customer is 700 EUR every year, the profit goal of this year in the caravan mobile home ensurance is 20000 EUR. \nIn summary, the given KPIs of the insurance company are: " +CBEIGE+ "\n\nBudget = 200000 \nRevenuePerBenefitItem = 700 \nProfitGoal = 20000\n"+CEND+'\nThen to keep the account balance not to be deficit (MTAB and MBPTAB) and to reach the proft goal (MTAP and MBPTAP), then the insurance company should be:\n\n')
logCBAdropMSS.style.set_table_styles(_hover_yellow_ss)
# Column abbreviations used in the tables above (for reference):
#log_colsselecdropM=["Classifier", "Log Loss","BI","BO", "CI", "CO", "IR","BTOR"]
#log_colsCBAselecdropM =["Classifier", "MTAB","MBPTAB","MTAP","MBPTAP"]
def _comparison_barplot(frame, metric, bar_color, xlabel, title):
    """Draw one horizontal bar chart comparing classifiers on one metric column."""
    sns.set_color_codes("muted")
    sns.barplot(x=metric, y='Classifier', data=frame, color=bar_color)
    plt.xlabel(xlabel)
    plt.title(title)
    plt.show()

sns.set(rc={'figure.figsize': (7.27, 5.27)})
# Holdout experiment (logdropM).
_comparison_barplot(logdropM, 'BI', "g", 'BI', 'Amount of Benefit Items')
_comparison_barplot(logdropM, 'CI', "r", 'CI', 'Amount of cost items')
_comparison_barplot(logdropM, 'BO', "g", 'BO %', 'Benefit Ratio')
_comparison_barplot(logdropM, 'IR', "b", 'IR %', 'Improvement Ratio')
_comparison_barplot(logdropM, 'PR', "g", 'PR %', 'Classifier Profit Ratio')
_comparison_barplot(logdropM, 'Log Loss', "b", 'Log Loss', 'Log Loss')
# Feature-selection experiment (logselecdropM).
_comparison_barplot(logselecdropM, 'BI', "g", 'BI', 'Classifier Amount of Benefit Items')
_comparison_barplot(logselecdropM, 'CI', "r", 'CI', 'Classifier Amount of cost items')
_comparison_barplot(logselecdropM, 'BO', "g", 'BO %', 'Classifier Benefit Ratio')
_comparison_barplot(logselecdropM, 'IR', "b", 'IR %', 'Improvement Ratio')
_comparison_barplot(logselecdropM, 'PR', "g", 'PR %', 'Classifier Profit Ratio')
_comparison_barplot(logselecdropM, 'Log Loss', "b", 'Log Loss', 'Classifier Log Loss')
# Column-dropped experiment (logdropM2).
_comparison_barplot(logdropM2, 'BI', "g", 'BI', 'Classifier Amount of Benefit Items')
_comparison_barplot(logdropM2, 'CI', "r", 'CI', 'Classifier Amount of cost items')
_comparison_barplot(logdropM2, 'BO', "g", 'BO %', 'Classifier Benefit Ratio')
_comparison_barplot(logdropM2, 'IR', "b", 'IR %', 'Improvement Ratio')
_comparison_barplot(logdropM2, 'PR', "g", 'PR %', 'Classifier Profit Ratio')
_comparison_barplot(logdropM2, 'Log Loss', "b", 'Log Loss', 'Classifier Log Loss')
# Stratified-KFold + SMOTE experiment (logdropMSS).
_comparison_barplot(logdropMSS, 'BI', "g", 'BI', 'Classifier Amount of Benefit Items')
_comparison_barplot(logdropMSS, 'CI', "r", 'CI', 'Classifier Amount of cost items')
_comparison_barplot(logdropMSS, 'BO', "g", 'BO %', 'Classifier Benefit Ratio')
_comparison_barplot(logdropMSS, 'IR', "b", 'IR %', 'Improvement Ratio')
_comparison_barplot(logdropMSS, 'PR', "g", 'PR %', 'Classifier Profit Ratio')
_comparison_barplot(logdropMSS, 'Log Loss', "b", 'Log Loss', 'Classifier Log Loss')
# For each candidate predictor, show the caravan-ownership share per feature
# level (stacked percentage bars), the absolute stacked counts, and print the
# underlying value counts and cross-tabulation against C86CARAVAN.
for _feature, _pct_label, _count_label in [
    ('C47PPERSAUT', 'Contribution of car policies', 'Contribution of car policies'),
    ('C43MKOOPKLA', 'Purchasing power class', 'Purchasing power class'),
    ('C44PWAPART', 'Contribution private third party insurance', 'Contribution private third party insurance'),
    ('C61PPLEZIER', ' Contribution boat policies', 'Contribution boat policies'),
]:
    num_car_caravan = pd.crosstab(Train[_feature], Train['C86CARAVAN'])
    # Row-normalised version: share of caravan owners within each level.
    num_car_caravan_pct = num_car_caravan.div(num_car_caravan.sum(1).astype(float), axis=0)
    num_car_caravan_pct.plot(kind='bar', stacked=True, color=['steelblue', 'lightpink'], grid=True)
    plt.xlabel(_pct_label)
    plt.ylabel('caravan')
    num_car_caravan.plot(kind='bar', stacked=True, color=['steelblue', 'lightpink'], grid=True)
    plt.xlabel(_count_label)
    plt.ylabel('caravan')
    print(Train[_feature].value_counts())
    print(pd.crosstab(Train[_feature], Train['C86CARAVAN']))